diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1594e798a2ba3f735a28a43ef933d80b3b3f8964..cb646d3ce5d660734a27c0ac9f18ad54cd3e1c1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,6 +65,7 @@ option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
+option(WITH_PSLIB       "Compile with pslib support"                    OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
@@ -131,8 +132,6 @@ if (APPLE OR WIN32)
 endif()
 
 if (WIN32)
-    set(WITH_AVX OFF CACHE STRING
-            "Disable AVX when compiling for Windows" FORCE)
     set(WITH_DSO OFF CACHE STRING
             "Disable DSO when compiling for Windows" FORCE)
     set(WITH_MKL OFF CACHE STRING
@@ -217,6 +216,12 @@ include(cupti)
 include(external/gzstream)
 endif (NOT WIN32)
 
+if(WITH_PSLIB)  # pslib and its companion packages (cmake/external/*.cmake)
+    foreach(pslib_package libmct pslib_brpc pslib)
+        include(external/${pslib_package})
+    endforeach()
+endif()
+
 if(WITH_DISTRIBUTE)
     if(WITH_GRPC)
         include(external/grpc)
@@ -284,6 +289,12 @@ set(EXTERNAL_LIBS
     ${PYTHON_LIBRARIES}
 )
 
+# Append pslib and its companion packages to EXTERNAL_LIBS, preserving the
+# order pslib, pslib_brpc, libmct.
+if(WITH_PSLIB)
+    list(APPEND EXTERNAL_LIBS pslib pslib_brpc libmct)
+endif()
+
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 51f7a61631d7102b60646abe1c6dd7775692f157..4ee2fdcf2db6bfa373f814ee4c0ab4d708486ea8 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -84,6 +84,10 @@ if(NOT WITH_GOLANG)
     add_definitions(-DPADDLE_WITHOUT_GOLANG)
 endif(NOT WITH_GOLANG)
 
+if(WITH_PSLIB)
+    add_definitions(-DPADDLE_WITH_PSLIB)  # enables the #ifdef PADDLE_WITH_PSLIB code paths
+endif()
+
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
     add_definitions(-DEIGEN_USE_GPU)
diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..27cff8cfb6315c9b4fa5677ad9062bee73a0e5d8
--- /dev/null
+++ b/cmake/external/libmct.cmake
@@ -0,0 +1,97 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Downloads the prebuilt libmct package and exposes it as the `libmct` target.
+#
+# libmct has no option() of its own: it is wanted whenever WITH_PSLIB is ON.
+# WITH_LIBMCT is only consulted when it is actually defined, so an explicit
+# -DWITH_LIBMCT=OFF (or the cached OFF written below) still skips this file.
+IF(NOT WITH_PSLIB OR (DEFINED WITH_LIBMCT AND NOT WITH_LIBMCT))
+  return()
+ENDIF()
+
+IF(WIN32 OR APPLE)
+    # message() joins its arguments without a separator, so the second string
+    # starts with a space.
+    MESSAGE(WARNING
+        "Windows or Mac is not supported with LIBMCT in Paddle yet."
+        " Force WITH_LIBMCT=OFF")
+    SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE)
+    return()
+ENDIF()
+
+INCLUDE(ExternalProject)
+
+SET(LIBMCT_PROJECT       "extern_libmct")
+# Use the built-in release unless the caller supplied both a version and a URL.
+IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL))
+  MESSAGE(STATUS "use pre defined download url")
+  SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE)
+  SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE)
+  SET(LIBMCT_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${LIBMCT_VER}/${LIBMCT_NAME}.tar.gz" CACHE STRING "" FORCE)
+ENDIF()
+# LIBMCT_NAME names the tarball and the unpacked directory below, so give it a
+# default when only LIBMCT_VER and LIBMCT_URL were overridden.
+IF(NOT DEFINED LIBMCT_NAME)
+  SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE)
+ENDIF()
+MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}")
+SET(LIBMCT_SOURCE_DIR    "${THIRD_PARTY_PATH}/libmct")
+SET(LIBMCT_DOWNLOAD_DIR  "${LIBMCT_SOURCE_DIR}/src/${LIBMCT_PROJECT}")
+SET(LIBMCT_DST_DIR       "libmct")
+SET(LIBMCT_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
+SET(LIBMCT_INSTALL_DIR   ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR})
+SET(LIBMCT_ROOT          ${LIBMCT_INSTALL_DIR})
+SET(LIBMCT_INC_DIR       ${LIBMCT_ROOT}/include)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib")
+
+INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR})
+
+# Generate a minimal project in the download directory whose only job is to
+# install the unpacked include/ and lib/ directories.
+FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "PROJECT(LIBMCT)\n"
+  "install(DIRECTORY ${LIBMCT_NAME}/include ${LIBMCT_NAME}/lib \n"
+  "        DESTINATION ${LIBMCT_DST_DIR})\n")
+
+# NOTE(review): --no-check-certificate disables TLS certificate checks for the
+# download and the archive is not checksummed — confirm this is acceptable.
+ExternalProject_Add(
+    ${LIBMCT_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${LIBMCT_SOURCE_DIR}
+    DOWNLOAD_DIR          ${LIBMCT_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz
+                          && tar zxvf ${LIBMCT_NAME}.tar.gz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT}
+)
+
+# `libmct` is a placeholder target: a static library built from a generated
+# one-line C file, or an INTERFACE library in the remaining case.
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
+    # Named after this target so it cannot clash with another module's file.
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/libmct_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+    add_library(libmct STATIC ${dummyfile})
+else()
+    add_library(libmct INTERFACE)
+endif()
+
+ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
+LIST(APPEND external_project_dependencies libmct)
+
diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..3b495d78e2c61f90418adbc5746792bc6e49d90b
--- /dev/null
+++ b/cmake/external/pslib.cmake
@@ -0,0 +1,95 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Downloads the prebuilt pslib package and exposes libps.so as the imported
+# shared library target `pslib`. Does nothing unless WITH_PSLIB is ON.
+IF(NOT WITH_PSLIB)
+  return()
+ENDIF()
+
+IF(WIN32 OR APPLE)
+    # message() joins its arguments without a separator, so the second string
+    # starts with a space.
+    MESSAGE(WARNING
+        "Windows or Mac is not supported with PSLIB in Paddle yet."
+        " Force WITH_PSLIB=OFF")
+    SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE)
+    return()
+ENDIF()
+
+INCLUDE(ExternalProject)
+
+SET(PSLIB_PROJECT       "extern_pslib")
+# Use the built-in release unless the caller supplied both a version and a URL.
+IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL))
+  MESSAGE(STATUS "use pre defined download url")
+  SET(PSLIB_VER "0.1.0" CACHE STRING "" FORCE)
+  SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE)
+  SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/${PSLIB_NAME}.tar.gz" CACHE STRING "" FORCE)
+ENDIF()
+# PSLIB_NAME names the tarball and the unpacked directory below, so give it a
+# default when only PSLIB_VER and PSLIB_URL were overridden.
+IF(NOT DEFINED PSLIB_NAME)
+  SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE)
+ENDIF()
+MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}")
+SET(PSLIB_SOURCE_DIR    "${THIRD_PARTY_PATH}/pslib")
+SET(PSLIB_DOWNLOAD_DIR  "${PSLIB_SOURCE_DIR}/src/${PSLIB_PROJECT}")
+SET(PSLIB_DST_DIR       "pslib")
+SET(PSLIB_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
+SET(PSLIB_INSTALL_DIR   ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR})
+SET(PSLIB_ROOT          ${PSLIB_INSTALL_DIR})
+SET(PSLIB_INC_DIR       ${PSLIB_ROOT}/include)
+SET(PSLIB_LIB_DIR       ${PSLIB_ROOT}/lib)
+SET(PSLIB_LIB           ${PSLIB_LIB_DIR}/libps.so)
+# NOTE(review): it is not visible here whether the pslib package ships
+# libiomp5.so; the path is only used by the WITH_C_API install rule below.
+SET(PSLIB_IOMP_LIB      ${PSLIB_LIB_DIR}/libiomp5.so)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib")
+
+INCLUDE_DIRECTORIES(${PSLIB_INC_DIR})
+
+# Generate a minimal project in the download directory whose only job is to
+# install the unpacked include/ and lib/ directories.
+FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "PROJECT(PSLIB)\n"
+  "install(DIRECTORY ${PSLIB_NAME}/include ${PSLIB_NAME}/lib \n"
+  "        DESTINATION ${PSLIB_DST_DIR})\n")
+
+# NOTE(review): --no-check-certificate disables TLS certificate checks for the
+# download and the archive is not checksummed — confirm this is acceptable.
+ExternalProject_Add(
+    ${PSLIB_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${PSLIB_SOURCE_DIR}
+    DOWNLOAD_DIR          ${PSLIB_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz
+                          && tar zxvf ${PSLIB_NAME}.tar.gz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT}
+)
+
+# Imported target for libps.so. It depends on the external project so that the
+# download and install steps are ordered before targets that use `pslib`.
+ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
+ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
+LIST(APPEND external_project_dependencies pslib)
+
+IF(WITH_C_API)
+  INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib)
+ENDIF()
diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..7ff5a8aca187240108164900638f5a376e9fbc93
--- /dev/null
+++ b/cmake/external/pslib_brpc.cmake
@@ -0,0 +1,94 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Downloads the prebuilt pslib_brpc package and exposes its libbrpc.a archive
+# as the imported target `pslib_brpc`.
+#
+# There is no option() for this package: it is wanted whenever WITH_PSLIB is
+# ON. WITH_PSLIB_BRPC is only consulted when it is actually defined, so an
+# explicit -DWITH_PSLIB_BRPC=OFF (or the cached OFF written below) still skips it.
+IF(NOT WITH_PSLIB OR (DEFINED WITH_PSLIB_BRPC AND NOT WITH_PSLIB_BRPC))
+  return()
+ENDIF()
+
+IF(WIN32 OR APPLE)
+    # message() joins its arguments without a separator, so the second string
+    # starts with a space.
+    MESSAGE(WARNING
+        "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet."
+        " Force WITH_PSLIB_BRPC=OFF")
+    SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE)
+    return()
+ENDIF()
+
+INCLUDE(ExternalProject)
+
+SET(PSLIB_BRPC_PROJECT       "extern_pslib_brpc")
+# Use the built-in release unless the caller supplied both a name and a URL.
+IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL))
+  MESSAGE(STATUS "use pre defined download url")
+  SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE)
+  SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE)
+  SET(PSLIB_BRPC_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_BRPC_VER}/${PSLIB_BRPC_NAME}.tar.gz" CACHE STRING "" FORCE)
+ENDIF()
+MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}")
+SET(PSLIB_BRPC_SOURCE_DIR    "${THIRD_PARTY_PATH}/pslib_brpc")
+SET(PSLIB_BRPC_DOWNLOAD_DIR  "${PSLIB_BRPC_SOURCE_DIR}/src/${PSLIB_BRPC_PROJECT}")
+SET(PSLIB_BRPC_DST_DIR       "pslib_brpc")
+SET(PSLIB_BRPC_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
+SET(PSLIB_BRPC_INSTALL_DIR   ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR})
+SET(PSLIB_BRPC_ROOT          ${PSLIB_BRPC_INSTALL_DIR})
+SET(PSLIB_BRPC_INC_DIR       ${PSLIB_BRPC_ROOT}/include)
+SET(PSLIB_BRPC_LIB_DIR       ${PSLIB_BRPC_ROOT}/lib)
+SET(PSLIB_BRPC_LIB           ${PSLIB_BRPC_LIB_DIR}/libbrpc.a)
+# NOTE(review): it is not visible here whether the pslib_brpc package ships
+# libiomp5.so; the path is only used by the WITH_C_API install rule below.
+SET(PSLIB_BRPC_IOMP_LIB      ${PSLIB_BRPC_LIB_DIR}/libiomp5.so)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib")
+
+INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR})
+
+# Generate a minimal project in the download directory whose only job is to
+# install the unpacked include/ and lib/ directories.
+FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "PROJECT(PSLIB_BRPC)\n"
+  "install(DIRECTORY ${PSLIB_BRPC_NAME}/include ${PSLIB_BRPC_NAME}/lib \n"
+  "        DESTINATION ${PSLIB_BRPC_DST_DIR})\n")
+
+# NOTE(review): --no-check-certificate disables TLS certificate checks for the
+# download and the archive is not checksummed — confirm this is acceptable.
+ExternalProject_Add(
+    ${PSLIB_BRPC_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${PSLIB_BRPC_SOURCE_DIR}
+    DOWNLOAD_DIR          ${PSLIB_BRPC_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_NAME}.tar.gz
+                          && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT}
+)
+
+# IMPORTED_LOCATION is libbrpc.a, a static archive, so the imported target is
+# declared STATIC rather than SHARED.
+ADD_LIBRARY(pslib_brpc STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
+ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
+LIST(APPEND external_project_dependencies pslib_brpc)
+
+IF(WITH_C_API)
+  INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib)
+ENDIF()
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index a3599dd798c07f57ed82e3f25b6bb9fc4f8bdc3a..623c53f4f75bbd217c157bcdda0cb12c510269ee 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -18,8 +18,8 @@ ENDIF()
 
 INCLUDE(python_module)
 
-FIND_PACKAGE(PythonInterp ${PY_VERSION})
-FIND_PACKAGE(PythonLibs ${PY_VERSION})
+FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED)
+FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED)
 
 if(WIN32)
     execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
@@ -79,6 +79,5 @@ IF(PYTHONINTERP_FOUND)
         "please use pip to upgrade protobuf. pip install -U protobuf")
     ENDIF()
 ENDIF(PYTHONINTERP_FOUND)
-
 INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
 INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index b30403d2d81ce471f39b4d92e24a500fe41eeebb..f9d4cd97400a68e613e3dd5467191a0d42a9942e 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -24,12 +24,6 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
 set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
 set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
 
-if (WIN32)
-    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
-else(WIN32)
-    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
-endif (WIN32)
-
 ExternalProject_Add(
     extern_snappy
     GIT_REPOSITORY "https://github.com/google/snappy"
@@ -56,6 +50,16 @@ ExternalProject_Add(
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
+IF(WIN32)
+    # Copy snappy.lib to the lib-prefixed name that SNAPPY_LIBRARIES points at.
+    IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
+        add_custom_command(TARGET extern_snappy POST_BUILD
+                COMMAND ${CMAKE_COMMAND} -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib)
+    ENDIF()
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
+else(WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+endif (WIN32)
 
 add_library(snappy STATIC IMPORTED GLOBAL)
 set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
index 4c2d64f627401071098e72bfb930fb5d62fa042d..c3e1212d8f8358e0148b5e00223414c9696686ee 100644
--- a/cmake/external/xxhash.cmake
+++ b/cmake/external/xxhash.cmake
@@ -56,7 +56,12 @@ else()
 endif()
 
 if (WIN32)
-  set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib")
+  # Copy xxhash.lib to the lib-prefixed name that XXHASH_LIBRARIES points at.
+  IF(NOT EXISTS "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib")
+    add_custom_command(TARGET extern_xxhash POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E copy ${XXHASH_INSTALL_DIR}/lib/xxhash.lib ${XXHASH_INSTALL_DIR}/lib/libxxhash.lib)
+  ENDIF()
+  set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib")
 else()
   set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
 endif ()
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index c3d73235453c8c9fd2859c3ab142888e8bda2dbe..d35073753725cd5772de3fc7a23af5ba69a65558 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -19,12 +19,6 @@ SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib)
 SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE)
 SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)
 
-IF(WIN32)
-  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
-ELSE(WIN32)
-  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
-ENDIF(WIN32)
-
 INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
 INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
 
@@ -49,6 +43,16 @@ ExternalProject_Add(
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
+IF(WIN32)
+  # Copy zlibstatic.lib to the libz.lib name that ZLIB_LIBRARIES points at.
+  IF(NOT EXISTS "${ZLIB_INSTALL_DIR}/lib/libz.lib")
+    add_custom_command(TARGET extern_zlib POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E copy ${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib ${ZLIB_INSTALL_DIR}/lib/libz.lib)
+  ENDIF()
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.lib" CACHE FILEPATH "zlib library." FORCE)
+ELSE(WIN32)
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
+ENDIF(WIN32)
 
 ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index c679d8507d8a9d3bce48b7f38491dadd9f2fb7f6..9f0adef7aa603ec5a3c8a5aa347613f462c43e60 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -32,24 +32,35 @@ function(copy TARGET)
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
         if (WIN32)
-            # windows cmd shell will not expand wildcard automatically.
-            # below expand the files,libs and copy them by rules.
-            file(GLOB header_files ${src} "*.h")
-            file(GLOB static_lib_files ${src} "*.lib")
-            file(GLOB dll_lib_files ${src} "*.dll")
-            set(src_files ${header_files} ${static_lib_files} ${dll_lib_files})
-
-            if (NOT "${src_files}" STREQUAL "")
-                list(REMOVE_DUPLICATES src_files)
-            endif ()
-            add_custom_command(TARGET ${TARGET} PRE_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
-                    )
-            foreach (src_file ${src_files})
+            if(IS_DIRECTORY ${src})
+                get_filename_component(last_path ${src} NAME)
+                string(APPEND dst "/" ${last_path})
                 add_custom_command(TARGET ${TARGET} PRE_BUILD
-                        COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
-                        COMMENT "copying ${src_file} -> ${dst}")
-            endforeach ()
+                        COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
+                        )
+                # The IS_DIRECTORY test above only passes for an existing
+                # directory, so no separate EXISTS check is needed here.
+                # ${CMAKE_COMMAND} is used instead of a bare `cmake` so the copy
+                # does not depend on cmake being on PATH at build time.
+                add_custom_command(TARGET ${TARGET} PRE_BUILD
+                        COMMAND ${CMAKE_COMMAND} -E copy_directory "${src}" "${dst}"
+                        COMMENT "copying ${src} -> ${dst}")
+            else()
+                # windows cmd shell will not expand wildcard automatically.
+                # below expand the files, and copy them by rules.
+                file(GLOB src_files ${src})
+                if (NOT "${src_files}" STREQUAL "")
+                    list(REMOVE_DUPLICATES src_files)
+                endif ()
+                add_custom_command(TARGET ${TARGET} PRE_BUILD
+                        COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
+                        )
+                foreach (src_file ${src_files})
+                    add_custom_command(TARGET ${TARGET} PRE_BUILD
+                            COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
+                            COMMENT "copying ${src_file} -> ${dst}")
+                endforeach ()
+            endif()
         else (WIN32) # not windows
             add_custom_command(TARGET ${TARGET} PRE_BUILD
                     COMMAND mkdir -p "${dst}"
@@ -95,7 +106,7 @@ copy(xxhash_lib
         DEPS xxhash
         )
 
-if (NOT PROTOBUF_FOUND)
+if (NOT PROTOBUF_FOUND OR WIN32)
     set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
     copy(protobuf_lib
             SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
@@ -138,27 +149,25 @@ if (WITH_NGRAPH)
             )
 endif ()
 
-if (NOT WIN32)
-    if (NOT MOBILE_INFERENCE AND NOT RPI)
-        set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
-        copy(snappy_lib
-                SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
-                DSTS ${dst_dir} ${dst_dir}/lib
-                DEPS snappy)
+if (NOT MOBILE_INFERENCE AND NOT RPI)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
+    copy(snappy_lib
+            SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
+            DSTS ${dst_dir} ${dst_dir}/lib
+            DEPS snappy)
 
-        set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
-        copy(snappystream_lib
-                SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
-                DSTS ${dst_dir} ${dst_dir}/lib
-                DEPS snappystream)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
+    copy(snappystream_lib
+            SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
+            DSTS ${dst_dir} ${dst_dir}/lib
+            DEPS snappystream)
 
-        set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
-        copy(zlib_lib
-                SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
-                DSTS ${dst_dir} ${dst_dir}/lib
-                DEPS zlib)
-    endif ()
-endif (NOT WIN32)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
+    copy(zlib_lib
+            SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
+            DSTS ${dst_dir} ${dst_dir}/lib
+            DEPS zlib)
+endif ()
 
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
@@ -191,9 +200,21 @@ if (WITH_ANAKIN AND WITH_MKL)
     list(APPEND inference_deps anakin_inference_lib)
 endif ()
 
+# When TensorRT was found, copy its Nv*.h headers and libnvinfer* libraries
+# into third_party/install/tensorrt under FLUID_INSTALL_DIR.
+if (TENSORRT_FOUND)
+    copy(tensorrt_lib DEPS ${inference_deps} 
+        SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer*
+        DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib)
+endif ()
 set(module "inference")
+# NOTE(review): the Windows path assumes CMAKE_BUILD_TYPE is set; it is empty by default under multi-config generators.
+set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
+if(WIN32)
+    set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*)
+endif()
 copy(inference_lib DEPS ${inference_deps}
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+  SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib}
        ${src_dir}/${module}/api/paddle_*.h
   DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
         )
@@ -233,7 +254,7 @@ copy(third_party DEPS fluid_lib_dist
 
 # only need libpaddle_fluid.so/a and paddle_*.h for inference-only library
 copy(inference_api_lib DEPS fluid_lib_dist
-  SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+  SRCS ${paddle_fluid_lib}
        ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h
   DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
 )
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 07c6b964aa94b3cb11e9a26f2ca1d9ab75af6abe..5e9901bb87c9a454a393a913b6da6e82266ee719 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -37,8 +37,16 @@ paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=Non
 paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, ''))
+paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12))
+paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False))
+paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
@@ -201,6 +209,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None
 paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
 paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index cea4a448574cfa822ef1498cc3ea1d58a7aea43f..412bc9cbe88b860a698d17e239d7b94d8956b781 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,17 +1,18 @@
 
-# windows treat symbolic file as a real file, which is different with unix
-# We create a hidden file and compile it instead of origin source file.
+# Windows treats a symbolic file as a real file, which differs from Unix.
+# We create a hidden file and compile it instead of the original source file.
 function(windows_symbolic TARGET)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
+  set(multiValueArgs SRCS PATH)
   cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
   foreach(src ${windows_symbolic_SRCS})
   get_filename_component(src ${src} NAME_WE)
   if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
       message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
   endif()
 
-  # only copy the xx.cu to .xx.cu when the content are modified
+  # Only copy xx.cu to .xx.cu when the content has been modified
   set(copy_flag 1)
   if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
   file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
@@ -32,7 +33,7 @@ endfunction()
 
 add_subdirectory(ir)
 add_subdirectory(details)
-# ddim lib
+# ddim lib
 proto_library(framework_proto SRCS framework.proto)
 proto_library(async_executor_proto SRCS data_feed.proto)
 
@@ -91,8 +92,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
 
 if(WITH_GPU)
   if (WIN32)
-    # windows treat symbolic file as a real file, which is different with unix
-    # We create a hidden file and compile it instead of origin source file.
+    # Windows treats a symbolic file as a real file, which differs from Unix.
+    # We create a hidden file and compile it instead of the original source file.
       windows_symbolic(hidden_file SRCS data_type_transform.cu)
       nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
       add_dependencies(data_type_transform hidden_file)
@@ -143,7 +144,8 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
 py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
-# Generate an empty __init__.py to make framework_py_proto as a valid python module.
+# Generate an empty __init__.py so that framework_py_proto is a valid
+# Python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 if (NOT WIN32)
@@ -195,7 +197,12 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         graph build_strategy
         fast_threaded_ssa_graph_executor variable_helper)
 
-cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
+# async_executor links pslib_brpc and pslib only when WITH_PSLIB is ON.
+set(async_executor_pslib_deps "")
+if(WITH_PSLIB)
+    set(async_executor_pslib_deps pslib_brpc pslib)
+endif()
+cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper ${async_executor_pslib_deps})
 
 cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
index afb2dd2f064384da39904f6aceead4fa915a80f2..ee3c5e01f87eeb123f43f867296e35cc8adb7e8e 100644
--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@@ -29,6 +29,9 @@ limitations under the License. */
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/pybind/pybind.h"
+#ifdef PADDLE_WITH_PSLIB
+#include <pslib.h>
+#endif
 
 namespace paddle {
 namespace framework {
@@ -47,6 +50,11 @@ void AsyncExecutor::CreateThreads(
   worker->SetDataFeed(reader);
   worker->SetFetchVarNames(fetch_var_names);
   worker->BindingDataFeedMemory();
+#ifdef PADDLE_WITH_PSLIB
+  worker->SetPSlibPtr(_pslib_ptr);
+  worker->SetPullDenseThread(_pull_dense_thread);
+  worker->SetParamConfig(&_param_config);
+#endif
 }
 
 void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers,  // NOLINT
@@ -60,12 +68,177 @@ void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers,  // NOLINT
   readers[0]->SetFileList(filelist);
 }
 
+#ifdef PADDLE_WITH_PSLIB
+// Creates the PSlib handle, calls init_server() and fills _param_config.
+void AsyncExecutor::InitServer(const std::string& dist_desc, int index) {
+  _pslib_ptr.reset(new paddle::distributed::PSlib());
+  _pslib_ptr->init_server(dist_desc, index);
+  InitParamConfig();
+}
+
+// Creates the PSlib handle, calls init_worker() and fills _param_config.
+void AsyncExecutor::InitWorker(const std::string& dist_desc,
+                               const std::vector<uint64_t>& host_sign_list,
+                               int node_num, int index) {
+  // init_worker() is handed a mutable pointer to the caller's sign list.
+  uint64_t* host_signs = const_cast<uint64_t*>(host_sign_list.data());
+  _pslib_ptr.reset(new paddle::distributed::PSlib());
+  _pslib_ptr->init_worker(dist_desc, host_signs, node_num, index);
+  InitParamConfig();
+}
+
+// Thin forwards to PSlib::run_server() and PSlib::stop_server().
+uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); }
+void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); }
+
+void AsyncExecutor::GatherServers(const std::vector<uint64_t>& host_sign_list,
+                                  int node_num) {
+  uint64_t* host_signs = const_cast<uint64_t*>(host_sign_list.data());
+  _pslib_ptr->gather_servers(host_signs, node_num);  // plain forward to PSlib
+}
+
+// Copies what the workers need out of the PSlib parameter message into
+// _param_config: fea_dim of the first server table whose class name contains
+// "SparseTable", the push wait counters, the skipped ops and, for every
+// trainer-side sparse/dense table, its variable and gradient names keyed by
+// table id.
+void AsyncExecutor::InitParamConfig() {
+  const auto& server_param =
+      _pslib_ptr->get_param()->server_param().downpour_server_param();
+  const auto& trainer_param = _pslib_ptr->get_param()->trainer_param();
+
+  for (int i = 0; i < server_param.downpour_table_param_size(); ++i) {
+    const auto& table_param = server_param.downpour_table_param(i);
+    // std::string::find reports "not found" as npos, not as -1.
+    if (table_param.table_class().find("SparseTable") != std::string::npos) {
+      _param_config.fea_dim = table_param.accessor().fea_dim();
+      break;
+    }
+  }
+  // NOTE(review): fea_dim keeps its previous value when no table matched;
+  // confirm AsyncWorkerParamConfig gives it a sane default.
+  _param_config.slot_dim = _param_config.fea_dim - 2;
+  _param_config.tmp_push_dense_wait_times =
+      static_cast<int32_t>(trainer_param.push_dense_per_batch());
+  _param_config.tmp_push_sparse_wait_times =
+      static_cast<int32_t>(trainer_param.push_sparse_per_batch());
+
+  // Ops listed in skip_op are copied verbatim.
+  for (int t = 0; t < trainer_param.skip_op_size(); ++t) {
+    _param_config.skip_op.push_back(trainer_param.skip_op(t));
+  }
+
+  // Sparse tables: slot values, slot gradients and the slot_key -> table_id
+  // alias map.
+  for (int t = 0; t < trainer_param.sparse_table_size(); ++t) {
+    const auto& table = trainer_param.sparse_table(t);
+    const auto table_id = table.table_id();
+    std::vector<std::string> slot_values;
+    for (int i = 0; i < table.slot_value_size(); ++i) {
+      slot_values.push_back(table.slot_value(i));
+      // slot_key(i) is read with the same index as slot_value(i).
+      _param_config.slot_alias_to_table[table.slot_key(i)] = table_id;
+    }
+    std::vector<std::string> slot_gradients;
+    for (int i = 0; i < table.slot_gradient_size(); ++i) {
+      slot_gradients.push_back(table.slot_gradient(i));
+    }
+
+    _param_config.slot_input_vec[table_id] = std::move(slot_values);
+    _param_config.gradient_var[table_id] = std::move(slot_gradients);
+    _param_config.sparse_table_id.push_back(table_id);
+  }
+
+  // Dense tables: variable names, gradient names, ids and feature dims.
+  for (int t = 0; t < trainer_param.dense_table_size(); ++t) {
+    const auto& table = trainer_param.dense_table(t);
+    const auto table_id = table.table_id();
+    std::vector<std::string> variable_names;
+    for (int i = 0; i < table.dense_variable_name_size(); ++i) {
+      variable_names.push_back(table.dense_variable_name(i));
+    }
+    std::vector<std::string> gradient_names;
+    for (int i = 0; i < table.dense_gradient_variable_name_size(); ++i) {
+      gradient_names.push_back(table.dense_gradient_variable_name(i));
+    }
+
+    _param_config.dense_variable_name[table_id] = std::move(variable_names);
+    _param_config.dense_gradient_variable_name[table_id] =
+        std::move(gradient_names);
+    _param_config.dense_table_id.push_back(table_id);
+    _param_config.dense_table_size.push_back(table.fea_dim());
+  }
+}
+
+// Fills each dense variable with N(0, 1) samples scaled by
+// 0.2 / sqrt(dims()[0]) and pushes the result to its dense table.
+void AsyncExecutor::InitModel() {
+  for (auto table_id : _param_config.dense_table_id) {
+    std::vector<paddle::ps::Region> regions;
+    for (auto& t : _param_config.dense_variable_name[table_id]) {
+      Variable* var = root_scope_->FindVar(t);
+      CHECK(var != nullptr) << "var[" << t << "] not found";
+      LoDTensor* tensor = var->GetMutable<LoDTensor>();
+      float* g = tensor->data<float>();
+      CHECK(g != nullptr) << "var[" << t << "] value not initialized";
+      float init_range = 0.2;
+      int rown = tensor->dims()[0];
+      init_range /= sqrt(rown);
+      std::normal_distribution<float> ndistr(0.0, 1.0);
+      // Signed 64-bit index: a 32-bit `0u` counter could wrap on huge tensors.
+      const int64_t numel = tensor->numel();
+      for (int64_t i = 0; i < numel; ++i) {
+        g[i] = ndistr(local_random_engine()) * init_range;
+      }
+
+      paddle::ps::Region reg(g, numel);
+      regions.emplace_back(std::move(reg));
+    }
+
+    auto push_status = _pslib_ptr->_worker_ptr->push_dense_param(
+        regions.data(), regions.size(), table_id);
+    push_status.wait();
+    auto status = push_status.get();
+    if (status != 0) {
+      LOG(FATAL) << "push dense param failed, status[" << status << "]";
+      exit(-1);
+    }
+  }
+}
+
+void AsyncExecutor::SaveModel(const std::string& path) {
+  auto ret = _pslib_ptr->_worker_ptr->flush();
+  ret.wait();  // the flush must finish before the save is issued
+  ret = _pslib_ptr->_worker_ptr->save(path, 0);
+  ret.wait();
+  int32_t feasign_cnt = ret.get();
+  if (feasign_cnt < 0) {  // every negative count is a failure, not only -1
+    LOG(FATAL) << "save model failed";
+    exit(-1);
+  }
+}
+
+// Creates and starts the dense-pull thread; does nothing unless mode is "mpi".
+void AsyncExecutor::PrepareDenseThread(const std::string& mode) {
+  if (mode != "mpi") return;
+
+  DensePullThreadParam param;
+  param.ps_client = _pslib_ptr->_worker_ptr;
+  param.threshold = 1;
+  param.training_thread_num = actual_thread_num;
+  param.root_scope = root_scope_;
+  param.dense_params = &_param_config.dense_variable_name;
+
+  _pull_dense_thread.reset(new DensePullThread(param));
+  _pull_dense_thread->start();
+}
+#endif
+
 void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
                                 const std::string& data_feed_desc_str,
                                 const std::vector<std::string>& filelist,
                                 const int thread_num,
                                 const std::vector<std::string>& fetch_var_names,
-                                const bool debug) {
+                                const std::string& mode, const bool debug) {
   std::vector<std::thread> threads;
 
   auto& block = main_program.Block(0);
@@ -82,7 +255,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
   google::protobuf::TextFormat::ParseFromString(data_feed_desc_str,
                                                 &data_feed_desc);
 
-  int actual_thread_num = thread_num;
+  actual_thread_num = thread_num;
   int file_cnt = filelist.size();
   PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty");
 
@@ -106,11 +279,21 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
   // todo: should be factory method for creating datafeed
   std::vector<std::shared_ptr<DataFeed>> readers;
   PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist);
-
+#ifdef PADDLE_WITH_PSLIB
+  PrepareDenseThread(mode);
+#endif
   std::vector<std::shared_ptr<ExecutorThreadWorker>> workers;
   workers.resize(actual_thread_num);
   for (auto& worker : workers) {
+#ifdef PADDLE_WITH_PSLIB
+    if (mode == "mpi") {
+      worker.reset(new AsyncExecutorThreadWorker);
+    } else {
+      worker.reset(new ExecutorThreadWorker);
+    }
+#else
     worker.reset(new ExecutorThreadWorker);
+#endif
   }
 
   // prepare thread resource here
@@ -128,7 +311,11 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
   for (auto& th : threads) {
     th.join();
   }
-
+#ifdef PADDLE_WITH_PSLIB
+  if (mode == "mpi") {
+    _pull_dense_thread->stop();
+  }
+#endif
   root_scope_->DropKids();
 
   return;
diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h
index f4d2a79ac592e02f49ec0b988c824dc98883fbf6..95c8472b2f3b6b0c2d95fcf0c0b6f00e7f39b032 100644
--- a/paddle/fluid/framework/async_executor.h
+++ b/paddle/fluid/framework/async_executor.h
@@ -14,9 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <time.h>
 #include <map>
 #include <memory>
-#include <mutex>  // NOLINT
+#include <mutex>   // NOLINT
+#include <random>  // local_random_engine
 #include <set>
 #include <string>
 #include <thread>  // NOLINT
@@ -30,6 +32,31 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
+inline double current_realtime() {
+#if defined(_WIN32)
+  return 0.0;  // Windows builds only ever report zero
+#else
+  struct timespec now;  // CLOCK_REALTIME reading, converted to seconds
+  clock_gettime(CLOCK_REALTIME, &now);
+  return now.tv_sec + now.tv_nsec * 1e-9;
+#endif
+}
+
+inline std::default_random_engine& local_random_engine() {
+  struct SeededEngine {  // seeded once per thread: shared counter + clock
+    std::default_random_engine engine;
+    SeededEngine() {
+      static std::atomic<uint64_t> counter(0);
+      const uint64_t millis = static_cast<uint64_t>(current_realtime() * 1000);
+      std::seed_seq seeds = {counter++, counter++, counter++, millis};
+      engine.seed(seeds);
+    }
+  };
+  thread_local SeededEngine per_thread;
+  return per_thread.engine;
+}
+
 class AsyncExecutor {
  public:
   AsyncExecutor(Scope* scope, const platform::Place& place);
@@ -39,7 +66,19 @@ class AsyncExecutor {
                    const std::vector<std::string>& filelist,
                    const int thread_num,
                    const std::vector<std::string>& fetch_names,
-                   const bool debug = false);
+                   const std::string& mode, const bool debug = false);
+#ifdef PADDLE_WITH_PSLIB
+  void InitServer(const std::string& dist_desc, int index);
+  void InitWorker(const std::string& dist_desc,
+                  const std::vector<uint64_t>& host_sign_list, int node_num,
+                  int index);
+  uint64_t StartServer();
+  void StopServer();
+  void GatherServers(const std::vector<uint64_t>& host_sign_list, int node_num);
+  void InitModel();
+  void SaveModel(const std::string& path);
+  void InitParamConfig();
+#endif
 
  private:
   void CreateThreads(ExecutorThreadWorker* worker,
@@ -48,10 +87,21 @@ class AsyncExecutor {
                      const std::vector<std::string>& fetch_var_names,
                      Scope* root_scope, const int thread_index,
                      const bool debug);
+#ifdef PADDLE_WITH_PSLIB
+  void PrepareDenseThread(const std::string& mode);
+#endif
 
  public:
+#ifdef PADDLE_WITH_PSLIB
+  std::shared_ptr<paddle::distributed::PSlib> _pslib_ptr;
+  std::shared_ptr<DensePullThread> _pull_dense_thread;
+  AsyncWorkerParamConfig _param_config;
+#endif
   Scope* root_scope_;
   platform::Place place_;
+
+ private:
+  int actual_thread_num;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index a99cf53b410433c6e4b8a19821779f28c25e678f..41155cfb7714b10fa51bc56fc90af4ee3d8b4a1a 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -64,6 +64,7 @@ bool DataFeed::PickOneFile(std::string* filename) {
     return false;
   }
   *filename = filelist_[file_idx_++];
+  VLOG(3) << "pick file:" << *filename;
   return true;
 }
 
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 97f7713d97494b1d6b11ef77796fbf4f75e598dc..63a68ba3a5c289be7c2d352717fe5911539df8a7 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -50,8 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
+cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass)
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
-
+cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
+        all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
 cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
 cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
@@ -63,7 +65,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
 
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) 
+# reference_count_pass is always in the list, so it needs no extra WITH_GPU-only
+# append.
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass
+        all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass)
+cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph)
+cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass)
 
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
 
@@ -84,4 +91,5 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse
 cc_library(build_strategy SRCS build_strategy.cc DEPS
         graph_viz_pass multi_devices_graph_pass
         multi_devices_graph_print_pass multi_devices_graph_check_pass
-        fuse_elewise_add_act_pass multi_batch_merge_pass)
+        fuse_elewise_add_act_pass multi_batch_merge_pass
+        memory_optimize_pass)
diff --git a/paddle/fluid/framework/details/analysis_var_pass.cc b/paddle/fluid/framework/details/analysis_var_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..223b9da3cfba33fc32d1334cddccb9f503bd0bef
--- /dev/null
+++ b/paddle/fluid/framework/details/analysis_var_pass.cc
@@ -0,0 +1,656 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/analysis_var_pass.h"
+#include <algorithm>
+#include <atomic>
+#include <deque>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <queue>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+DEFINE_bool(enable_subgraph_optimize, false,
+            "SubGraph also reuse global graph variables, it will reduce the "
+            "memory occupation "
+            "but a higher risk of memory reuse error. default disabled.");
+DEFINE_string(memory_optimize_debug, "",
+              "debug the operator output variable when do the variable reuse. "
+              "memory reuse pass. "
+              "only for debug, default disabled.");
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
+  if (op1->Type() != op2->Type()) return false;  // type, then inputs/outputs
+  return op1->Inputs() == op2->Inputs() && op1->Outputs() == op2->Outputs();
+}
+
+// Generic case: every element of `nodes` is handed to `callback`, with no
+// filtering.
+template <typename Container, typename Callback>
+class FilterVariableImpl {
+ public:
+  void operator()(const Container& nodes, Callback callback) {
+    for (auto* element : nodes) callback(element);
+  }
+};
+
+// std::vector<ir::Node*> case (op->inputs/outputs): only variable nodes that
+// are not control variables reach `callback`.
+template <typename Callback>
+class FilterVariableImpl<std::vector<ir::Node*>, Callback> {
+ public:
+  void operator()(const std::vector<ir::Node*>& nodes, Callback callback) {
+    for (auto* candidate : nodes) {
+      if (!candidate->IsVar() || candidate->IsCtrlVar()) continue;
+      callback(candidate);
+    }
+  }
+};
+
+template <typename Container, typename Callback>
+void FilterVariables(const Container& nodes, Callback callback) {
+  FilterVariableImpl<Container, Callback>()(nodes, callback);  // dispatch
+}
+
+// Walks the ops in CFG order: an output that matches a pooled variable is
+// renamed to it, and variables that are live-in but not live-out join the pool.
+std::unique_ptr<ir::Graph> AnalysisVarPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto nodes = graph->Nodes();
+  auto subblock_vars = GetSubBlockVars(nodes);
+  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+
+  cfg_.reset(new details::ControlFlowGraph(*graph));
+  cfg_->LiveVariableAnalysis();
+  InitSSAGraphNodes();
+
+  int reuse_id = 0;
+  for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) {
+    auto& op = cfg_->Ops()[idx];
+    auto* op_desc = op->Op();
+    // some op in graph has no op desc
+    if (op_desc == nullptr) continue;
+    if (OpHasSubBlock(op_desc)) {
+      if (FLAGS_enable_subgraph_optimize) {
+        SubGraphOptimize(op_desc);
+      } else {
+        VLOG(3) << op->Name()
+                << " has subblock, but disable subgraph optimize. skipped.";
+        continue;
+      }
+    }
+
+    for (auto& var : op->outputs) {
+      if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
+        ir::Node* cache = pool_.NodeMatch(var);
+        if (var->Name() == FLAGS_memory_optimize_debug) {
+          VLOG(3) << "start match var " << DebugString(var) << " of op "
+                  << op->Name();
+          VLOG(3) << pool_.ToString();
+          VLOG(3) << "matched in pool : "
+                  << ((cache == nullptr) ? "False" : "True");
+        }
+        if (cache != nullptr) {
+          if (var->Name() == cache->Name()) {
+            VLOG(3) << "The same cache variable is cascade reused. "
+                    << var->Name() << " is re-filled to the pool after "
+                    << "the reused op is finished. Current op can not "
+                    << "replace it again. Skip this candidate.";
+            continue;
+          }
+
+          int node_idx_in_pool = pool_.GetIndex(cache);
+          VLOG(3) << string::Sprintf(
+              "!!! %s,  %s => %s, cache idx %d, pool size %d",
+              std::to_string(reuse_id++), DebugString(var), DebugString(cache),
+              node_idx_in_pool, static_cast<int>(pool_.size()));
+          // update CFG on the fly; the reused var may re-enter the pool.
+          cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
+          // NOTE(dzhwinter): update both the ProgramDesc and the IR Graph:
+          // op_desc/var_desc feed CreateOp/CreateVar at run time, while the
+          // IR Graph defines the dependencies between nodes.
+          RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
+          RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
+
+          pool_.Erase(cache);
+        }
+      }
+    }
+    // fill the pool
+    for (auto var : cfg_->LiveIn(op)) {
+      if (cfg_->LiveOut(op).count(var) == 0) {
+        ir::Node* var_node = cfg_->GetNodeFromVarName(var, op);
+        if (var_node == nullptr) continue;
+        if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
+          pool_.Insert(var_node, op);
+        }
+      }
+    }
+  }
+  graph->ResolveHazard(var_nodes_);
+
+  // For early delete pass. use GraphNodePool load the unlived vars.
+  // 1. find all deps op for each unlived var in memory pool.
+  for (auto& op : graph->Nodes()) {
+    for (auto& var : op->inputs) {
+      if (pool_.Has(var)) {
+        pool_.Insert(var, op);
+      }
+    }
+  }
+  // 2. convert ir node based memory pool to graph node
+  // because Node* maybe released between passes.
+  auto& graph_pool = graph->Get<GraphNodePool>(kGraphNodePool);
+  for (auto it = pool_.begin(); it != pool_.end(); ++it) {
+    std::unordered_set<OpDesc*> descs;
+    for (auto& op : it->second) {
+      PADDLE_ENFORCE(op->IsOp());
+      descs.insert(op->Op());
+    }
+    graph_pool.push_back(std::make_pair(it->first->Name(), descs));
+  }
+
+  return graph;
+}
+
+// Tries pool-based reuse for the outputs of the ops inside the "sub_block"
+// attribute of op_desc (conditional block, while op and their grad op).
+void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const {
+  auto* sub_block_desc =
+      AttrReader(op_desc->GetAttrMap()).Get<BlockDesc*>("sub_block");
+  // create a mirror block to construct an IR Graph.
+  ProgramDesc prog;
+  auto* copy_block = prog.MutableBlock(0);
+  for (auto* op : sub_block_desc->AllOps()) {
+    auto* copy_op = copy_block->AppendOp();
+    copy_op->CopyFrom(*op);
+    copy_op->Flush();
+  }
+
+  for (auto* var : sub_block_desc->AllVars()) {
+    auto* copy_var = copy_block->Var(var->Name());
+    copy_var->SetDataType(var->GetDataType());
+    // only lod tensor can be reused. So ignore the multiple dims case.
+    copy_var->SetType(var->GetType());
+    copy_var->SetShape(var->GetShape());
+    copy_var->SetPersistable(var->Persistable());
+  }
+
+  ir::Graph sub_graph(prog);
+  // Collect the op nodes, not the variables: the lookup below calls Op().
+  std::unordered_set<ir::Node*> sub_graph_all_ops;
+  FilterVariables(sub_graph.Nodes(), [&](ir::Node* node) {
+    if (node->IsOp() && node->Op() != nullptr) {
+      sub_graph_all_ops.emplace(node);
+    }
+  });
+  int sub_reuse_id = 0;
+  // subgraph nodes is unordered, reuse need to follow the desc order.
+  // The mirror graph is built from copies of the descs, so the op node is
+  // matched by desc content rather than by pointer.
+  for (auto* sub_op_desc : sub_block_desc->AllOps()) {
+    ir::Node* sub_op = nullptr;
+    for (auto* node : sub_graph_all_ops) {
+      if (IsSameDesc(node->Op(), sub_op_desc)) {
+        sub_op = node;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(sub_op != nullptr);
+    for (auto* var : sub_op->outputs) {
+      if (NodeCanReused(var)) {
+        ir::Node* cache = pool_.NodeMatch(var);
+        if (cache != nullptr) {
+          if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
+            continue;
+          }
+          int node_idx_in_pool = pool_.GetIndex(cache);
+          VLOG(3) << string::Sprintf(
+              "!!! %s,  %s => %s, cache idx %d, pool size %d",
+              std::to_string(sub_reuse_id++), DebugString(var),
+              DebugString(cache), node_idx_in_pool,
+              static_cast<int>(pool_.size()));
+          // NOTE(dzh): subblock is not in IR graph. Modify the block_desc
+          // immediately so the subblock reuse strategy takes effect. It is a
+          // single op in the graph, so the ir nodes need no update.
+          sub_op_desc->Rename(var->Name(), cache->Name());
+          if (sub_op_desc->Block()->HasVar(var->Name())) {
+            sub_op_desc->Block()->RemoveVar(var->Name());
+          }
+        }
+      }
+    }
+  }
+}
+
+// Gathers the input and output argument names of every op owning a sub-block.
+std::unordered_set<std::string> AnalysisVarPass::GetSubBlockVars(
+    const std::unordered_set<ir::Node*>& nodes) const {
+  std::unordered_set<std::string> names;
+  for (auto* node : nodes) {
+    const bool has_desc = node->IsOp() && node->Op() != nullptr;
+    if (!has_desc || !OpHasSubBlock(node->Op())) continue;
+
+    const auto in_names = node->Op()->InputArgumentNames();
+    const auto out_names = node->Op()->OutputArgumentNames();
+    names.insert(in_names.begin(), in_names.end());
+    names.insert(out_names.begin(), out_names.end());
+  }
+  return names;
+}
+
+void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var,
+                                           const std::string& cache_var,
+                                           size_t idx) const {
+  const auto& ops = cfg_->Ops();  // ops before idx keep the old name
+  for (size_t pos = idx; pos < ops.size(); ++pos) {
+    PADDLE_ENFORCE(ops[pos]->IsOp() && ops[pos]->Op());
+    auto* desc = ops[pos]->Op();
+    desc->RenameInput(var, cache_var);
+    desc->RenameOutput(var, cache_var);
+    if (desc->Block()->HasVar(var)) desc->Block()->RemoveVar(var);
+    desc->Flush();
+  }
+}
+
+// Fills var_nodes_ (name -> nodes, in first-seen order) from the inputs and
+// outputs of every CFG op; does nothing if var_nodes_ is already populated.
+void AnalysisVarPass::InitSSAGraphNodes() const {
+  if (!var_nodes_.empty()) return;
+
+  std::unordered_map<std::string, std::unordered_set<ir::Node*>> seen;
+  auto record_once = [&](ir::Node* node) {
+    // emplace().second is true only the first time this node is inserted.
+    if (seen[node->Name()].emplace(node).second) {
+      var_nodes_[node->Name()].emplace_back(node);
+    }
+  };
+
+  // Per op, inputs are recorded before outputs.
+  for (auto* op : cfg_->Ops()) {
+    for (auto* node : op->inputs) record_once(node);
+    for (auto* node : op->outputs) record_once(node);
+  }
+}
+
+// Replaces the graph nodes named `var` with fresh nodes named `cache_var` for
+// the ops from `idx` on, then removes every node recorded under `var`.
+void AnalysisVarPass::RenameVarInGraphNode(const std::string& var,
+                                           const std::string& cache_var,
+                                           size_t idx, ir::Graph* graph) const {
+  PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
+                 var_nodes_[var].at(0)->Var() != nullptr);
+  std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
+  var_desc->SetName(cache_var);  // copy of var's desc, only the name differs
+
+  for (size_t i = idx; i < cfg_->Ops().size(); ++i) {
+    auto* op = cfg_->Ops()[i];
+
+    // redirect the input to the latest version of cache_var
+    for (auto* node : op->inputs) {
+      if (node->Name() == var) {
+        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
+        var_nodes_[cache_var].emplace_back(cache_node);
+
+        // swap node to cache_node: it takes over node's consumers and producer
+        cache_node->outputs.insert(cache_node->outputs.end(),
+                                   node->outputs.begin(), node->outputs.end());
+        PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp());
+        auto* prev_op = node->inputs[0];
+        std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
+                     cache_node);
+        cache_node->inputs.emplace_back(prev_op);
+        for (auto* next_op : node->outputs) {
+          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
+                       cache_node);
+        }
+      }
+    }
+
+    // if we need to rename the output,
+    // always create a newer version of cache_var
+    for (auto* node : op->outputs) {
+      if (node->Name() == var) {
+        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
+        var_nodes_[cache_var].emplace_back(cache_node);
+
+        // swap node to cache node: `op` itself becomes the producer
+        cache_node->outputs.insert(cache_node->outputs.end(),
+                                   node->outputs.begin(), node->outputs.end());
+        cache_node->inputs.emplace_back(op);
+        std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node);
+        for (auto* next_op : node->outputs) {
+          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
+                       cache_node);
+        }
+      }
+    }
+  }
+
+  // release node of unused var in graph
+  for (auto* node : var_nodes_[var]) {
+    graph->RemoveNode(node);
+  }
+  var_nodes_.at(var).clear();
+}
+
+// Reusable: a plain, non-persistable LOD_TENSOR variable with a shape.
+bool AnalysisVarPass::NodeCanReused(ir::Node* node) const {
+  if (!node->IsVar() || node->IsCtrlVar()) return false;
+  auto* desc = node->Var();
+  if (desc->Persistable() || desc->GetType() != proto::VarType::LOD_TENSOR ||
+      desc->GetShape().empty()) {
+    return false;
+  }
+  // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
+  std::string name = node->Name();
+  if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
+    return false;
+  if (skip_set_.count(name)) return false;
+  for (auto* op : node->inputs) {  // producers; some have no op desc
+    if (op->Op() != nullptr && op->Op()->HasAttr("force_cpu")) {
+      // op output force generated in cpu, can not be reused.
+      return framework::AttrReader(op->Op()->GetAttrMap())
+                 .Get<bool>("force_cpu") == 0;
+    }
+  }
+  return true;
+}
+
+// True when any attribute holds a BlockDesc* or a vector of BlockDesc*.
+bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const {
+  for (auto& entry : desc->GetAttrMap()) {
+    const auto& held = entry.second.type();
+    if (held == typeid(BlockDesc*)) return true;               // NOLINT
+    if (held == typeid(std::vector<BlockDesc*>)) return true;  // NOLINT
+  }
+  return false;
+}
+
+// Returns the op nodes that carry an OpDesc, ordered so that data dependencies
+// are respected and, where possible, the kAllOpDescs order is followed.
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
+  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
+                 "Graph has no attribute of kAllOpDescs.");
+  // 1. get op desc order
+  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
+
+  // 2. topology sort order
+  auto nodes = graph.Nodes();
+  std::deque<ir::Node*> ops;
+  FilterVariables(nodes, [&](ir::Node* op) {
+    if (op->IsOp() && op->Op() != nullptr) {
+      ops.emplace_back(op);
+    }
+  });
+  std::unordered_map<ir::Node*, size_t> op_deps;
+  std::list<ir::Node*> ready_ops;
+  std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
+
+  for (auto* op : ops) {
+    std::unordered_set<ir::Node*> preceding_op;
+    for (auto* in : op->inputs) {
+      if (in->inputs.empty()) continue;
+      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
+      preceding_op.emplace(in->inputs[0]);
+      pending_ops[in->inputs[0]].emplace(op);
+    }
+    op_deps[op] = preceding_op.size();
+    if (preceding_op.empty()) {
+      ready_ops.emplace_back(op);
+    }
+  }
+
+  // 3. generated op list based desc order and the topology order
+  std::vector<ir::Node*> ret;
+  std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
+
+  auto update_by_found_node = [&](ir::Node* found_node) {
+    for (auto* pending_op : pending_ops[found_node]) {
+      if (--op_deps[pending_op] == 0) {
+        ready_ops.emplace_back(pending_op);
+      }
+    }
+    ready_ops.remove(found_node);
+    ret.emplace_back(found_node);
+  };
+
+  while (!ready_ops.empty()) {
+    bool all_of_ready_op_unmatched = true;
+    for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
+      auto op_desc = *it;
+      ir::Node* found_node = nullptr;
+      for (auto* op : ready_ops) {
+        if (IsSameDesc(op->Op(), op_desc)) {
+          found_node = op;
+          break;
+        }
+      }
+
+      if (found_node == nullptr) {  // 3.1 op desc deleted by other pass
+        ++it;
+        continue;
+      } else {
+        all_of_ready_op_unmatched = false;
+        it = op_descs_list.erase(it);
+      }
+      update_by_found_node(found_node);
+    }
+
+    // 3.2 op descs are added by other pass: when nothing matched, the ready
+    // ops are new descs missing from the desc list (they may depend on each
+    // other), so emit them all.
+    std::list<ir::Node*> prev_ready_ops(ready_ops);
+    if (all_of_ready_op_unmatched) {
+      for (auto op : prev_ready_ops) {
+        update_by_found_node(op);
+      }
+    }
+  }
+
+  PADDLE_ENFORCE(std::all_of(
+      op_deps.begin(), op_deps.end(),
+      [&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
+
+  return ret;
+}
+
+// Builds the control-flow graph for `graph`: the op sequence is fixed first
+// via SortOpLikeDescOrder, then ConnectNodes links the ops and fills their
+// use/def sets.
+ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph)
+    : ops_(SortOpLikeDescOrder(graph)) {
+  ConnectNodes();
+}
+
+// Fills successors_/predecessors_ from the data dependencies between ops
+// (producer op -> var -> consumer op) and records the non-control input and
+// output var names of every op in uses_/defs_.
+void ControlFlowGraph::BuildCFGGraph() {
+  // FIXME(dzh): same effect with ConnectNodes, but use the control
+  // link to build dependency graph, it goes wrong in transformer.
+  for (ir::Node* op : ops_) {
+    for (auto& input_var : op->inputs) {
+      if (!input_var->inputs.empty()) {
+        PADDLE_ENFORCE(
+            input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
+            "Preceding Op Node of Var Node must be unique");
+        auto* pred_op = input_var->inputs[0];
+        // only producers that carry an OpDesc are linked
+        if (pred_op->Op() != nullptr) {
+          predecessors_[op].insert(pred_op);
+          successors_[pred_op].insert(op);
+        }
+      }
+      // control-dependency vars are not counted as uses
+      if (input_var->IsVar() && !input_var->IsCtrlVar()) {
+        uses_[op].insert(input_var->Name());
+      }
+    }
+    for (auto& output_var : op->outputs) {
+      // output var may be used by many op
+      for (auto* succ_op : output_var->outputs) {
+        if (succ_op->Op() != nullptr) {
+          successors_[op].insert(succ_op);
+          predecessors_[succ_op].insert(op);
+        }
+      }
+      // control-dependency vars are not counted as definitions
+      if (output_var->IsVar() && !output_var->IsCtrlVar()) {
+        defs_[op].insert(output_var->Name());
+      }
+    }
+  }
+}
+
+// Links every op in ops_ to the op that follows it in the sequence
+// (successors_/predecessors_) and records the names passed on by
+// FilterVariables for its inputs (uses_) and outputs (defs_).
+void ControlFlowGraph::ConnectNodes() {
+  for (size_t i = 0; i < ops_.size(); ++i) {
+    auto& op = ops_[i];
+    // The last op has no successor. Use an explicit bounds check instead of
+    // ops_.at(i + 1) inside try/catch(...): exceptions are not control flow,
+    // and catch(...) would also hide unrelated failures of the inserts.
+    if (i + 1 < ops_.size()) {
+      auto& next_op = ops_[i + 1];
+      successors_[op].insert(next_op);
+      predecessors_[next_op].insert(op);
+    }
+
+    FilterVariables(op->inputs,
+                    [&](ir::Node* var) { uses_[op].emplace(var->Name()); });
+
+    FilterVariables(op->outputs,
+                    [&](ir::Node* var) { defs_[op].emplace(var->Name()); });
+  }
+}
+
+// Computes live_in_/live_out_ for every op in ops_ with a work-list
+// iteration that runs until no live-in set changes any more.
+void ControlFlowGraph::LiveVariableAnalysis() {
+  // NOTE(dzh): variable liveness analysis (a.k.a. the reversed_ops
+  // algorithm). It iterates the operators from end to begin and computes the
+  // live-in/live-out variable sets of each op with the dataflow equations
+  //   live_out(op) = union of live_in(s) for every successor s of op
+  //   live_in(op)  = uses(op) + (live_out(op) - defs(op))
+  // The diff between in/out is then used for the variable reuse.
+  // For detail refer to
+  // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
+  std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
+  while (!work_list.empty()) {
+    ir::Node* op = work_list.front();
+    work_list.pop_front();
+    // Take the live_in calculated before (empty on the first visit). swap()
+    // leaves live_in_[op] as a well-defined empty set, whereas a moved-from
+    // set is only guaranteed to be in a valid but unspecified state.
+    std::set<std::string> prev_live_in;
+    prev_live_in.swap(live_in_[op]);
+
+    for (auto& s : successors_[op]) {
+      for (auto& var : live_in_[s]) {
+        live_out_[op].insert(var);
+      }
+    }
+    for (auto& var : uses_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : live_out_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : defs_[op]) {
+      // A variable that the op both reads and writes (in-place update) is
+      // still needed on entry, so it must stay in live_in.
+      if (uses_[op].count(var)) continue;
+      live_in_[op].erase(var);
+    }
+
+    // If the live_in is not changed, then the liveness analysis of
+    // predecessors is completed.
+    //
+    // Otherwise, recalculate the predecessors liveness
+    if (live_in_[op] != prev_live_in) {
+      for (auto& pre : predecessors_[op]) {
+        work_list.push_back(pre);
+      }
+    }
+  }
+}
+
+// Replaces the variable name `old_node` by `new_node` in the uses, defs,
+// live-in and live-out sets of every op from position `begin_idx` to the end
+// of the op sequence. A `begin_idx` outside [0, ops_.size()) renames nothing.
+void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
+                                           const std::string& new_node,
+                                           int begin_idx) {
+  // Swaps old_node for new_node in the name set of `op`, if it is present.
+  // operator[] is kept on purpose: as before, every visited op ends up with
+  // an entry in each of the four maps.
+  auto rename_in = [&](VarSetMap* var_sets, ir::Node* op) {
+    std::set<std::string>& names = (*var_sets)[op];
+    if (names.erase(old_node) > 0) {
+      names.insert(new_node);
+    }
+  };
+
+  // update graph from begin idx to the end
+  // `<` instead of `!=`: a negative begin_idx converts to a huge size_t and a
+  // too large one starts past the end; both must not index out of bounds.
+  for (size_t i = static_cast<size_t>(begin_idx); i < ops_.size(); ++i) {
+    auto* op = ops_[i];
+    rename_in(&uses_, op);
+    rename_in(&defs_, op);
+    rename_in(&live_in_, op);
+    rename_in(&live_out_, op);
+  }
+}
+
+// Returns a copy of the live-in variable names recorded for `op`.
+// Enforces that `op` has an entry in live_in_.
+const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
+  const auto found = live_in_.find(op);
+  PADDLE_ENFORCE(
+      found != live_in_.end(),
+      string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
+  const std::set<std::string>& names = found->second;
+  return names;
+}
+
+// Returns a copy of the live-out variable names recorded for `op`.
+// Enforces that `op` has an entry in live_out_.
+const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
+  const auto found = live_out_.find(op);
+  PADDLE_ENFORCE(
+      found != live_out_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  const std::set<std::string>& names = found->second;
+  return names;
+}
+
+// Returns a copy of the input variable names (uses) recorded for `op`.
+// Enforces that `op` has an entry in uses_.
+const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
+  auto it = uses_.find(op);
+  // The message names the set that is actually searched (uses_); it used to
+  // say "live_out", copied from LiveOut().
+  PADDLE_ENFORCE(
+      it != uses_.end(),
+      string::Sprintf("Expect %s in uses, but Not Found.", op->Name()));
+  return it->second;
+}
+
+// Const overload: returns a copy of the ordered op sequence.
+const std::vector<ir::Node*> ControlFlowGraph::Ops() const {
+  return ops_;
+}
+
+// Non-const overload: returns a reference to the ordered op sequence itself.
+std::vector<ir::Node*>& ControlFlowGraph::Ops() {
+  return ops_;
+}
+
+// In the ssa-graph several versions of a variable share one name. This
+// returns the var node called `name` that was written last by the ops that
+// precede `op` in the op sequence, or nullptr if none of them writes it
+// (e.g. a data node).
+ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
+                                               ir::Node* op) const {
+  ir::Node* latest = nullptr;
+  for (auto cur = ops_.begin(); cur != ops_.end() && *cur != op; ++cur) {
+    for (ir::Node* out_var : (*cur)->outputs) {
+      if (out_var->Name() != name) continue;
+      // later ops overwrite earlier matches, so the newest version wins
+      latest = out_var;
+    }
+  }
+  return latest;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+// Registers AnalysisVarPass under the name "analysis_var_pass". The pass
+// declares the graph attributes kGraphNodePool and kAllOpDescs as required.
+REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass)
+    .RequireGraphAttr(paddle::framework::details::kGraphNodePool)
+    .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
diff --git a/paddle/fluid/framework/details/analysis_var_pass.h b/paddle/fluid/framework/details/analysis_var_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..144204beafb341351172c29e3b4cd41db49be6f9
--- /dev/null
+++ b/paddle/fluid/framework/details/analysis_var_pass.h
@@ -0,0 +1,120 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+// Name of the graph attribute that holds the op descs
+// (a const std::vector<OpDesc*>) whose order SortOpLikeDescOrder follows.
+constexpr char kAllOpDescs[] = "all_op_descs";
+
+// Sorts the op nodes of `graph` topologically while following the op-desc
+// order stored under kAllOpDescs. The graph must carry that attribute.
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
+// Sorts the op nodes of `graph` in BFS order.
+std::vector<ir::Node*> BFSSortGraphOps(const ir::Graph& graph);
+
+class ControlFlowGraph;
+
+// Graph pass registered as "analysis_var_pass". Its helpers rename variables
+// in the program descs and in the IR nodes so that variables can be reused.
+// All state is `mutable` because the ApplyImpl entry point is const.
+class AnalysisVarPass : public ir::Pass {
+ protected:
+  // Pass entry point: takes ownership of `graph` and returns a graph.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+
+ private:
+  // fill the variable map(var_nodes) by version.
+  void InitSSAGraphNodes() const;
+  // update program descs
+  void RenameVarInGraphDesc(const std::string& var,
+                            const std::string& cache_var, size_t idx) const;
+  // update ir nodes
+  void RenameVarInGraphNode(const std::string& var,
+                            const std::string& cache_var, size_t idx,
+                            ir::Graph* graph) const;
+
+  void SubGraphOptimize(OpDesc* op_desc) const;
+  // check whether the tensor of a var node can be reused or not
+  bool NodeCanReused(ir::Node* node) const;
+  // scan subblock and collect the output/input variables.
+  std::unordered_set<std::string> GetSubBlockVars(
+      const std::unordered_set<ir::Node*>&) const;
+  // check whether the op has an attribute of type BlockDesc* or
+  // std::vector<BlockDesc*>, i.e. a sub-block
+  bool OpHasSubBlock(OpDesc* desc) const;
+
+ private:
+  // Reuse Node Pool, Owned.
+  mutable OrderedNodePairPool pool_;
+  // controlflow Graph
+  mutable std::unique_ptr<ControlFlowGraph> cfg_;
+  // skip set
+  // NOTE(review): presumably names of variables excluded from reuse -- confirm.
+  mutable std::unordered_set<std::string> skip_set_;
+  // var nodes: variable name -> its versioned nodes (see InitSSAGraphNodes)
+  mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
+};
+
+// Ordered view of the ops of an ir::Graph together with the per-op use/def
+// and live-in/live-out variable name sets used for liveness analysis.
+class ControlFlowGraph {
+ public:
+  ControlFlowGraph() = default;
+  // For IR Graph in parallelexecutor
+  // Sorts the ops with SortOpLikeDescOrder, then calls ConnectNodes.
+  explicit ControlFlowGraph(const ir::Graph& graph);
+
+  // Fills the live-in/live-out sets of every op.
+  void LiveVariableAnalysis();
+
+  // Renames `old_node` to `new_node` in the use/def/live sets of all ops
+  // from position `begin_idx` to the end of the op sequence.
+  void RenameVarInCFGGraph(const std::string& old_node,
+                           const std::string& new_node, int begin_idx);
+
+  // Each accessor returns a copy and enforces that `op` has an entry.
+  const std::set<std::string> LiveIn(ir::Node* op) const;
+  const std::set<std::string> LiveOut(ir::Node* op) const;
+  const std::set<std::string> Use(ir::Node* op) const;
+  // Copy of the op sequence (const) / reference to it (non-const).
+  const std::vector<ir::Node*> Ops() const;
+  std::vector<ir::Node*>& Ops();
+
+  // for ssa-graph nodes
+  // Latest var node named `name` written by an op before `op`, or nullptr.
+  ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
+
+ private:
+  // Links ops through their data dependencies (see the FIXME in the .cc).
+  void BuildCFGGraph();
+  // Links each op to the op that follows it in ops_ and fills uses_/defs_.
+  void ConnectNodes();
+  using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
+  using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
+  // ops that run after the key op
+  // (ConnectNodes: the next op in ops_; BuildCFGGraph: consumers of outputs).
+  NodeListMap successors_;
+  // ops that run before the key op
+  // (ConnectNodes: the previous op in ops_; BuildCFGGraph: input producers).
+  NodeListMap predecessors_;
+  // variables lived before run current op.
+  VarSetMap live_in_;
+  // variables lived after run current op.
+  VarSetMap live_out_;
+  VarSetMap uses_;  // op inputs
+  VarSetMap defs_;  // op outputs
+
+  std::vector<ir::Node*> ops_;  // op sequence by topology sort
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/analysis_var_pass_test.cc b/paddle/fluid/framework/details/analysis_var_pass_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bc4fd33f7058949ca60983ea666a21cb4877b3e
--- /dev/null
+++ b/paddle/fluid/framework/details/analysis_var_pass_test.cc
@@ -0,0 +1,470 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/analysis_var_pass.h"
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+// Operator used by the tests below to register op types; running it does
+// nothing.
+class DummyOp : public OperatorBase {
+ public:
+  DummyOp(const std::string& type, const VariableNameMap& inputs,
+          const VariableNameMap& outputs, const AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  // Intentionally empty: the tests only inspect the graph structure.
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
+};
+
+// Proto maker for the dummy "sum"/"dummy" ops: a duplicable input "X" and a
+// single output "Out".
+class SumOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  // `override` makes the compiler verify that this really implements the
+  // base-class hook, consistent with the other overrides in this file.
+  void Make() override {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+// Proto maker for the dummy "assign" op: a duplicable input "X" and a single
+// output "Out".
+class AssignOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  // `override` makes the compiler verify that this really implements the
+  // base-class hook, consistent with the other overrides in this file.
+  void Make() override {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+// Var-type inference for the dummy ops: the first "Out" variable gets the
+// type of the first "X" variable.
+class DummyVarTypeInference : public VarTypeInference {
+ public:
+  void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
+    const auto& x_names = op_desc.Input("X");
+    const auto src_type = block->Var(x_names.front())->GetType();
+    const auto dst_name = op_desc.Output("Out").front();
+    auto* dst_var = block->Var(dst_name);
+    dst_var->SetType(src_type);
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+// Register the op types used by the test programs below. All three are
+// backed by DummyOp and DummyVarTypeInference; "dummy" reuses SumOpMaker.
+REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
+                  paddle::framework::SumOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
+                  paddle::framework::AssignOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
+                  paddle::framework::SumOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+/*
+  https://en.wikipedia.org/wiki/Live_variable_analysis
+  Create a custom, classical dependency graph; the left column is the
+  instruction number.
+  1. a = 1
+  2. b = a
+  3. c = a
+  4. d = b + c
+  5. e = d
+
+  a--------+
+  |        |
+  b        c
+  |        |
+  d--------+
+  |
+  e
+  Then analyze the liveness range of these variables.
+ */
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Two op descs are considered the same when their type, inputs and outputs
+// are all equal.
+static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
+  if (op1->Type() != op2->Type()) return false;
+  if (op1->Inputs() != op2->Inputs()) return false;
+  return op1->Outputs() == op2->Outputs();
+}
+
+// Builds the program shared by the tests:
+//   b = assign(a); c = assign(a); d = sum(b, c); e = assign(d)
+// with a..e declared as LOD_TENSOR variables in block 0.
+inline static ProgramDesc FillProgramDesc() {
+  ProgramDesc prog;
+  for (const char* var_name : {"a", "b", "c", "d", "e"}) {
+    prog.MutableBlock(0)->Var(var_name)->SetType(proto::VarType::LOD_TENSOR);
+  }
+
+  // Appends one op of `type` reading `xs` through "X" and writing `out`
+  // through "Out".
+  auto append_op = [&prog](const std::string& type,
+                           const std::vector<std::string>& xs,
+                           const std::string& out) {
+    auto* new_op = prog.MutableBlock(0)->AppendOp();
+    new_op->SetType(type);
+    new_op->SetInput("X", xs);
+    new_op->SetOutput("Out", {out});
+  };
+
+  append_op("assign", {"a"}, "b");
+  append_op("assign", {"a"}, "c");
+  append_op("sum", {"b", "c"}, "d");
+  append_op("assign", {"d"}, "e");
+  return prog;
+}
+
+// Streams every element of `c` followed by a single space and returns the
+// resulting string (so a non-empty container yields a trailing space).
+template <typename Container>
+inline static std::string DebugString(const Container& c) {
+  std::stringstream out;
+  for (auto cur = c.begin(); cur != c.end(); ++cur) {
+    out << *cur << " ";
+  }
+  return out.str();
+}
+
+// Checks the live-in/live-out sets of the four ops of the shared program.
+TEST(CFGGraph, IRGraph) {
+  // prepare ir graph
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  ControlFlowGraph cfg(graph);
+  cfg.LiveVariableAnalysis();
+
+  using VarSet = std::set<std::string>;
+  auto& ops = cfg.Ops();
+
+  // op 0: b = assign(a)
+  ASSERT_TRUE((VarSet{"a"} == cfg.LiveIn(ops[0])));
+  ASSERT_TRUE((VarSet{"a", "b"} == cfg.LiveOut(ops[0])));
+
+  // op 1: c = assign(a)
+  ASSERT_TRUE((VarSet{"a", "b"} == cfg.LiveIn(ops[1])));
+  ASSERT_TRUE((VarSet{"b", "c"} == cfg.LiveOut(ops[1])));
+
+  // op 2: d = sum(b, c)
+  ASSERT_TRUE((VarSet{"b", "c"} == cfg.LiveIn(ops[2])));
+  ASSERT_TRUE((VarSet{"d"} == cfg.LiveOut(ops[2])));
+
+  // op 3: e = assign(d)
+  ASSERT_TRUE((VarSet{"d"} == cfg.LiveIn(ops[3])));
+  ASSERT_TRUE((VarSet{} == cfg.LiveOut(ops[3])));
+}
+
+// 1. normal test
+// With an unmodified graph the sorted nodes must follow the desc order.
+TEST(SortOpLikeDescOrder, NormalTest) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto sorted_nodes = SortOpLikeDescOrder(graph);
+  auto expected_descs = prog.Block(0).AllOps();
+  size_t idx = 0;
+  while (idx < sorted_nodes.size()) {
+    ir::Node* sorted_node = sorted_nodes[idx];
+    OpDesc* expected_desc = expected_descs[idx];
+    ASSERT_TRUE(IsSameDesc(sorted_node->Op(), expected_desc));
+    ++idx;
+  }
+}
+
+// 2. remove some op_desc
+// Removes the last op (e = assign(d)) from the graph while the cached descs
+// still list it; the remaining nodes must keep the desc order.
+TEST(SortOpLikeDescOrder, RemoveOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto nodes = graph.Nodes();
+  auto op_descs = prog.Block(0).AllOps();
+  // locate the op node whose last output is "e"
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    // guard back(): calling it on an empty vector is undefined behavior
+    if (node->IsOp() && !node->outputs.empty() &&
+        node->outputs.back()->Name() == "e") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  // drop the matching desc from the expected order
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  ir::Node* e = find_node_in_graph("e");
+  ir::Node* d = find_node_in_graph("d");
+  // std::remove only shifts elements; erase() is needed to actually drop the
+  // link, otherwise d keeps a dangling pointer to the removed op node.
+  d->outputs.erase(
+      std::remove(d->outputs.begin(), d->outputs.end(), found_node),
+      d->outputs.end());
+  graph.RemoveNode(found_node);
+  graph.RemoveNode(e);
+
+  // other node keeps the same order
+  auto remain_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < remain_nodes.size(); ++i) {
+    auto node = remain_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 3. add some op_desc
+// Adds a new op (d1 = sum(b, c)) to the graph that the cached descs do not
+// know about; the sorted nodes must match the expected order with the new op
+// at the end.
+TEST(SortOpLikeDescOrder, AddOpDesc) {
+  auto prog = FillProgramDesc();
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  ir::Graph graph(prog);
+
+  // Returns the first graph node named `s`; enforces that one exists.
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  // The cached descs differ from the real ones: this mimics an
+  // intermediate pass modifying the program desc.
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  // expected order: the four original descs ...
+  auto op_descs = prog.Block(0).AllOps();
+
+  // ... plus a new sum op that writes d1
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  // wire the new op node into the graph in both directions
+  ir::Node* node = graph.CreateOpNode(op);
+  ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+  ir::Node* b = find_node_in_graph("b");
+  ir::Node* c = find_node_in_graph("c");
+  node->outputs.emplace_back(d1);
+  node->inputs.emplace_back(b);
+  node->inputs.emplace_back(c);
+  d1->inputs.emplace_back(node);
+  b->outputs.emplace_back(node);
+  c->outputs.emplace_back(node);
+  // index 4 == end of the four original descs
+  op_descs.insert(op_descs.begin() + 4, op);
+
+  auto nodes = SortOpLikeDescOrder(graph);
+
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    auto node = nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 4. add and delete some op_desc
+// Removes the sum op (together with the assign op and var "e" that depend on
+// it) and adds a new op d1 = sum(b, c); the sorted nodes must match the
+// expected desc order.
+TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  // Returns the first graph node named `s`; enforces that one exists.
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  // remove sum node
+  auto op_descs = prog.Block(0).AllOps();
+  ir::Node* found_node = nullptr;
+  auto nodes = graph.Nodes();
+  for (auto node : nodes) {
+    if (node->Name() == "sum") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* c = find_node_in_graph("c");
+    ir::Node* e = find_node_in_graph("e");
+    // std::remove only shifts elements; erase() is needed to actually drop
+    // the links, otherwise dangling pointers to the removed node remain.
+    d->outputs.erase(
+        std::remove(d->outputs.begin(), d->outputs.end(), found_node),
+        d->outputs.end());
+    c->outputs.erase(
+        std::remove(c->outputs.begin(), c->outputs.end(), found_node),
+        c->outputs.end());
+    // the op that consumes the output of the removed sum op
+    ir::Node* pending_op = found_node->outputs[0]->outputs[0];
+    graph.RemoveNode(e);
+    graph.RemoveNode(pending_op);
+    graph.RemoveNode(found_node);
+  }
+
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    // back-link from the new var to its producer, as in the sibling tests
+    d1->inputs.emplace_back(node);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+  op_descs.insert(op_descs.begin() + 2, op);
+
+  // check the order
+  auto mynodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < mynodes.size(); ++i) {
+    auto node = mynodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 5. add and replace some op_desc inplace.
+// Adds a new op d1 = sum(b, c) and replaces e = assign(d) by
+// e = sum(d, d1); the sorted nodes must match the expected desc order.
+TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  // Returns the first graph node named `s`; enforces that one exists.
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  auto op_descs = prog.Block(0).AllOps();
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    d1->inputs.emplace_back(node);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+
+  op_descs.emplace_back(op);
+
+  // replace op_desc inplace
+  auto nodes = graph.Nodes();
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    if (node->IsOp() && node->Op() && node->Name() == "assign") {
+      if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
+        found_node = node;
+        break;
+      }
+    }
+  }
+  // fail loudly instead of passing nullptr to RemoveNode below
+  PADDLE_ENFORCE(found_node != nullptr);
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* e = find_node_in_graph("e");
+    // std::remove only shifts elements; erase() is needed to actually drop
+    // the links, otherwise dangling pointers to the removed node remain.
+    d->outputs.erase(
+        std::remove(d->outputs.begin(), d->outputs.end(), found_node),
+        d->outputs.end());
+    e->inputs.erase(
+        std::remove(e->inputs.begin(), e->inputs.end(), found_node),
+        e->inputs.end());
+    graph.RemoveNode(found_node);
+  }
+  // index 3 is the desc of the removed assign op (e = assign(d))
+  op_descs.erase(op_descs.begin() + 3);
+
+  auto replace_op = prog.MutableBlock(0)->AppendOp();
+  replace_op->SetType("sum");
+  replace_op->SetInput("X", {"d", "d1"});
+  replace_op->SetOutput("Out", {"e"});
+  {
+    ir::Node* sum2 = graph.CreateOpNode(replace_op);
+    ir::Node* e = find_node_in_graph("e");
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* d1 = find_node_in_graph("d1");
+    sum2->inputs.emplace_back(d);
+    sum2->inputs.emplace_back(d1);
+    sum2->outputs.emplace_back(e);
+    e->inputs.emplace_back(sum2);
+    d->outputs.emplace_back(sum2);
+    d1->outputs.emplace_back(sum2);
+  }
+
+  op_descs.emplace_back(replace_op);
+  // compare op order
+  auto graph_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < graph_nodes.size(); ++i) {
+    auto node = graph_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index d8526b3f2492992c5c0f6f5e0a85cffca7398700..779a9ed52365e66d8141f7e3a1183ef6d7832e4b 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -14,11 +14,16 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/details/build_strategy.h"
 
+#include <glog/logging.h>
+#include <memory>
+
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 
 namespace paddle {
@@ -69,6 +74,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     }
     VLOG(1) << "CollectiveContext:" << context->String();
 
+    // NOTE(dzh): memory optimize should be a runtime pass.
+    // However, after multi_devices_pass, VarHandle and OpHandle are
+    // the de facto IR, so any reuse on Graph is meaningless.
+    // As a side effect, memory optimize cannot foresee the fetched vars,
+    // so the fetch list should be set persistable before calling the Run
+    // interface.
+    if (strategy.memory_optimize_) {
+      auto analysis_var_pass = AppendPass("analysis_var_pass");
+    }
     // Convert graph to run on multi-devices.
     auto multi_devices_pass = AppendPass("multi_devices_pass");
     multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
@@ -79,8 +92,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     // Add a graph print pass to record a graph with device info.
     if (!strategy_.debug_graphviz_path_.empty()) {
       auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
-      multi_devices_print_pass->SetNotOwned<const std::string>(
-          "debug_graphviz_path", &strategy_.debug_graphviz_path_);
+      const std::string graph_path =
+          string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(),
+                          "_multi_devices_graph");
+      multi_devices_print_pass->Set<std::string>(kGraphvizPath,
+                                                 new std::string(graph_path));
       multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
           "graph_printer", new details::GraphvizSSAGraphPrinter);
     }
@@ -127,7 +143,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
   CreatePassesFromStrategy(false);
 
   std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
-
   for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
     if (pass->Type() == "multi_devices_pass") {
       pass->Erase("places");
@@ -145,6 +160,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->Erase("nccl_ctxs");
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
+    } else if (pass->Type() == "analysis_var_pass") {
+      const std::vector<OpDesc *> *all_op_descs =
+          new std::vector<OpDesc *>(main_program.Block(0).AllOps());
+      graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
+                                              all_op_descs);  // take ownership
+      graph->Set<GraphNodePool>(kGraphNodePool,
+                                new GraphNodePool);  // take ownership
+
+      pass->Erase(kAllOpDescs);
+      pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
+
     } else if (pass->Type() == "sequential_execution_pass") {
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;
@@ -166,6 +192,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
   }
   return graph;
 }
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
@@ -176,6 +203,7 @@ USE_PASS(multi_batch_merge_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
+USE_PASS(analysis_var_pass);
 USE_PASS(sequential_execution_pass);
 USE_PASS(all_reduce_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index c97be169575f578dfd18a6290230d1b3f3bd7596..29396501dc0efedd31a42b77f915fd66c9943985 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -60,8 +60,15 @@ struct BuildStrategy {
     kCustomized = 2,
   };
 
+  enum class OptimizeStrategy {
+    // To be implemented: brute force, recursively compute unused var names.
+    kBruteForce = 0,
+    kControlFlowGraph = 1,  // use cfg_graph algorithm, faster speed.
+  };
+
   ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
   GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
+  OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph};
 
   std::string debug_graphviz_path_{""};
 
@@ -69,6 +76,10 @@ struct BuildStrategy {
 
   bool enable_data_balance_{false};
 
+  bool memory_optimize_{false};
+
+  bool memory_early_delete_{false};
+
   bool enable_sequential_execution_{false};
 
   bool fuse_broadcast_op_{false};
diff --git a/paddle/fluid/framework/details/early_delete_op_handle.h b/paddle/fluid/framework/details/early_delete_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8382d34b790ba7c95415acdf0b55dc97a9cd265
--- /dev/null
+++ b/paddle/fluid/framework/details/early_delete_op_handle.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/garbage_collector.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Op handle that hands the memory held by the variables in `names` over to
+// the garbage collector `gc` when it runs.
+class EarlyDeleteOpHandle : public OpHandleBase {
+ public:
+  EarlyDeleteOpHandle(ir::Node* node, const Scope* scope,
+                      const platform::Place& place,
+                      const std::vector<std::string>& names,
+                      GarbageCollector* gc)
+      : OpHandleBase(node),
+        scope_(scope),
+        place_(place),
+        names_(names),
+        gc_(gc) {
+#ifdef PADDLE_WITH_CUDA
+    if (IsStreamGarabageCollector()) {
+      // ClearGPUTensors records event_ on this context's compute stream.
+      dev_ctx_ = static_cast<platform::CUDADeviceContext*>(
+          platform::DeviceContextPool::Instance().Get(place));
+      auto gpu_place = boost::get<platform::CUDAPlace>(place);
+      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
+      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+    }
+#endif
+  }
+  ~EarlyDeleteOpHandle() {
+#ifdef PADDLE_WITH_CUDA
+    if (IsStreamGarabageCollector()) {
+      // Select the device event_ was created on; place_ is always set.
+      auto gpu_place = boost::get<platform::CUDAPlace>(place_);
+      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
+      PADDLE_ENFORCE(cudaEventDestroy(event_));
+    }
+#endif
+  }
+
+  std::string Name() const override { return "early_delete"; }
+
+ protected:
+  // Moves the memory holders out of the named variables into ClearTensors.
+  void RunImpl() override {
+    std::vector<std::shared_ptr<memory::Allocation>> tensors;
+    auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope*>();
+    for (auto& var_name : names_) {
+      auto* var = local_scope->FindVar(var_name);
+      PADDLE_ENFORCE(var != nullptr,
+                     string::Sprintf("Local Scope not has var %s", var_name));
+      if (var->IsType<LoDTensor>()) {
+        tensors.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
+      } else if (var->IsType<SelectedRows>()) {
+        auto* value = var->GetMutable<SelectedRows>()->mutable_value();
+        tensors.emplace_back(value->MoveMemoryHolder());
+      } else if (var->IsType<LoDTensorArray>()) {
+        for (auto& tensor : *var->GetMutable<LoDTensorArray>()) {
+          tensors.emplace_back(tensor.MoveMemoryHolder());
+        }
+      }
+    }
+    if (!tensors.empty()) ClearTensors(tensors);
+  }
+
+ private:
+  // Routes the holders to the CPU or the GPU path depending on place_.
+  void ClearTensors(
+      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
+    if (platform::is_cpu_place(place_)) return ClearCPUTensors(tensors);
+    ClearGPUTensors(tensors);
+  }
+
+  // Adds the holders to gc_ only if gc_ is a CPUGarbageCollector.
+  void ClearCPUTensors(
+      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
+    auto* gc = dynamic_cast<CPUGarbageCollector*>(gc_);
+    if (gc != nullptr) gc->Add(tensors);
+  }
+
+  // Stream collectors wait for the compute stream first; empty without CUDA.
+  void ClearGPUTensors(
+      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
+#ifdef PADDLE_WITH_CUDA
+    auto* gc = dynamic_cast<StreamGarbageCollector*>(gc_);
+    if (gc != nullptr) {
+      auto compute_stream = dev_ctx_->stream();
+      auto callback_stream = gc->stream();
+      auto callback_func = [=]() {
+        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
+        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
+      };
+      gc_->Add(tensors, callback_func);
+    } else {
+      gc_->Add(tensors);
+    }
+#endif
+  }
+
+  const Scope* scope_;
+  const platform::Place place_;
+  std::vector<std::string> names_;
+  GarbageCollector* gc_;
+#ifdef PADDLE_WITH_CUDA
+  bool IsStreamGarabageCollector() const {
+    return dynamic_cast<const StreamGarbageCollector*>(gc_) != nullptr;
+  }
+  platform::CUDADeviceContext* dev_ctx_{nullptr};
+  cudaEvent_t event_;
+#endif
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..06a2451c136e3243ba41661fa691f9a6ef8b52ac
--- /dev/null
+++ b/paddle/fluid/framework/details/memory_early_delete_pass.cc
@@ -0,0 +1,117 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/memory_early_delete_pass.h"
+#include <queue>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Breadth-first search from var_in through pending ops and their outputs;
+// returns the first ComputationOpHandle on var_in's place, or nullptr.
+static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) {
+  std::queue<VarHandleBase*> frontier;
+  frontier.push(var_in);
+  while (!frontier.empty()) {
+    VarHandleBase* current = frontier.front();
+    frontier.pop();
+    for (auto* pending : current->PendingOps()) {
+      auto* candidate = dynamic_cast<ComputationOpHandle*>(pending);
+      if (candidate && candidate->GetPlace() == var_in->place_) {
+        return candidate;
+      }
+      for (auto* produced : pending->Outputs()) frontier.push(produced);
+    }
+  }
+  return nullptr;
+}
+
+// Attaches an "early_delete" op handle behind a computation op whenever a
+// kGraphNodePool variable is met that has no unvisited ops left.
+std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto& graph_pool = Get<GraphNodePool>(kGraphNodePool);
+  auto& gcs = Get<GarbageCollectorMap>(kGarbageCollector);
+
+  // var name -> ops (OpDesc*) using it that have not been visited yet.
+  std::unordered_map<std::string, std::unordered_set<OpDesc*>> unlived_vars;
+  unlived_vars.reserve(graph_pool.size());
+  for (auto& pair : graph_pool) {
+    unlived_vars.insert(std::make_pair(pair.first, pair.second));
+  }
+
+  auto compare_and_insert_early_delete_op = [&](
+      OpHandleBase* op, const std::vector<VarHandleBase*>& vars) {
+    if (unlived_vars.empty()) return;
+    // unlived vars can be deleted after the last used op has finished.
+    auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
+    const auto& places = Get<std::vector<platform::Place>>(kAllPlaces);
+    for (auto& var : vars) {
+      auto var_name = var->Node()->Name();
+      auto found = unlived_vars.find(var_name);
+      if (found == unlived_vars.end()) continue;
+      auto& users = found->second;
+      if (!users.empty()) {
+        if (compute_op != nullptr) users.erase(compute_op->Node()->Op());
+        continue;
+      }
+      // Test the cast before reading place_: `var` may not be a VarHandle.
+      auto* var_handle = dynamic_cast<VarHandle*>(var);
+      if (var_handle == nullptr || !var_handle->Node()->IsVar() ||
+          var_handle->Node()->IsCtrlVar())
+        continue;
+      // As in the reference count pass: fall back to the next computation op.
+      auto* anchor =
+          compute_op ? compute_op : FindNextComputationOpHandle(var_handle);
+      if (anchor == nullptr) continue;
+      auto* early_delete_node =
+          graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation);
+      GarbageCollector* gc = gcs.at(places[anchor->GetScopeIdx()]).get();
+      auto* early_delete_handle =
+          new EarlyDeleteOpHandle(early_delete_node, anchor->GetScope(),
+                                  var_handle->place_, {var_name}, gc);
+      if (anchor->Outputs().empty()) {
+        auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+        anchor->AddOutput(dep_var);
+        graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+      }
+      early_delete_handle->AddInput(anchor->Outputs().front());
+      VLOG(5) << "Add early delete op " << var_name << " to Operator"
+              << anchor->Name();
+    }
+  };
+
+  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+  for (auto& op : all_ops) {
+    compare_and_insert_early_delete_op(op, op->Inputs());
+    compare_and_insert_early_delete_op(op, op->Outputs());
+  }
+  return graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(memory_early_delete_pass,
+              paddle::framework::details::MemoryEarlyDeletePass)
+    .RequireGraphAttr(paddle::framework::details::kGraphNodePool)
+    .RequireGraphAttr(paddle::framework::details::kGarbageCollector);
diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.h b/paddle/fluid/framework/details/memory_early_delete_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..8215aa1b2baa223a111f9050d5488c5fc8ac0e6e
--- /dev/null
+++ b/paddle/fluid/framework/details/memory_early_delete_pass.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/details/early_delete_op_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class MemoryEarlyDeletePass : public ir::Pass {
+ protected:  // ApplyImpl is implemented in memory_early_delete_pass.cc.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_reuse_types.cc b/paddle/fluid/framework/details/memory_reuse_types.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b9ff518b9adcd366cc877998400a8bdc05fa033
--- /dev/null
+++ b/paddle/fluid/framework/details/memory_reuse_types.cc
@@ -0,0 +1,155 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include <iostream>
+#include <sstream>
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Size in bytes of the variable behind `n`: element size times the absolute
+// value of the product of its dims (a -1 dim therefore counts as 1).
+size_t NodeSizeInBytes(ir::Node* n) {
+  auto* desc = FindVarDescInBlock(n);
+  // 64-bit accumulator: the product of the dims can exceed the range of int.
+  int64_t numel = 1;
+  for (int64_t dim : desc->GetShape()) numel *= dim;
+  if (numel < 0) numel = -numel;
+  return SizeOfType(desc->GetDataType()) * static_cast<size_t>(numel);
+}
+
+// Renders `var` as "name[d0,d1,...]". If anything in the try block throws,
+// the bracket is left open and a "no VarDesc" note is appended instead.
+std::string DebugStringImpl(VarDesc* var) {
+  std::stringstream out;
+  out << var->Name();
+  out << "[";
+  try {
+    const auto dims = var->GetShape();
+    for (size_t idx = 0; idx < dims.size(); ++idx) {
+      // Comma in front of every element except the first.
+      if (idx != 0) out << ",";
+      out << dims[idx];
+    }
+    out << "]";
+  } catch (...) {
+    out << "Var has no VarDesc !!! Name:" << var->Name();
+  }
+  return out.str();
+}
+
+// Debug string of the VarDesc that FindVarDescInBlock finds for `var`.
+std::string DebugString(ir::Node* var) {
+  return DebugStringImpl(FindVarDescInBlock(var));
+}
+
+// NOTE(dzh): an ir node that was reused by a smaller node shows up in the
+// pool with the small size, so the original shape is read from the block desc.
+VarDesc* FindVarDescInBlock(ir::Node* n) {
+  PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1);
+  const std::string var_name = n->Name();
+  BlockDesc* owner = n->inputs[0]->Op()->Block();
+  PADDLE_ENFORCE(owner->HasVar(var_name),
+                 string::Sprintf("Block do not has var %s", var_name));
+  return owner->FindVar(var_name);
+}
+
+// Returns true when lhs is not larger (in bytes) than rhs. Nodes are only
+// comparable when both or neither have a runtime (-1) first dimension;
+// otherwise the result is false.
+struct NodeComparator {
+  bool operator()(ir::Node* lhs, ir::Node* rhs) const {
+    const auto lhs_dims = FindVarDescInBlock(lhs)->GetShape();
+    const auto rhs_dims = FindVarDescInBlock(rhs)->GetShape();
+    const bool lhs_dynamic = lhs_dims[0] == -1;
+    const bool rhs_dynamic = rhs_dims[0] == -1;
+    // Mixed runtime/fixed batch sizes never match.
+    if (lhs_dynamic != rhs_dynamic) return false;
+    return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs);
+  }
+};
+
+// Registers `op` as a user of `var`. A var that is new to the pool is placed
+// so that runtime-batch (-1) nodes precede fixed-batch ones and each group
+// ascends by NodeComparator.
+void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) {
+  PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
+  PADDLE_ENFORCE(op->IsOp());
+
+  // Already pooled: only remember the additional user.
+  auto known = mark_table_.find(var->Name());
+  if (known != mark_table_.end()) {
+    known->second->second.insert(op);
+    return;
+  }
+
+  const auto var_dims = FindVarDescInBlock(var)->GetShape();
+  const bool var_dynamic = static_cast<int>(var_dims[0]) == -1;
+  NodeComparator not_larger;
+  Iter pos = nodes_.begin();
+  for (; pos != nodes_.end(); ++pos) {
+    const int cached_batch = FindVarDescInBlock(pos->first)->GetShape()[0];
+    const bool cached_dynamic = cached_batch == -1;
+    if (cached_dynamic == var_dynamic) {
+      // Same group: stop at the first cached node that is larger than var.
+      if (!not_larger(pos->first, var)) break;
+    } else if (!cached_dynamic) {
+      // A fixed-batch node ends the runtime-batch group var belongs to.
+      break;
+    }
+  }
+
+  // std::list insertion leaves the iterators kept in mark_table_ valid.
+  pos = nodes_.insert(pos,
+                      std::make_pair(var, std::unordered_set<ir::Node*>{op}));
+  mark_table_[var->Name()] = pos;
+}
+
+int OrderedNodePairPool::GetIndex(ir::Node* var) {  // throws if not pooled
+  return std::distance(nodes_.begin(), mark_table_.at(var->Name()));
+}
+
+// Returns the first pooled node, in pool order, for which
+// NodeComparator(var, node) holds, or nullptr if there is none.
+ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const {
+  NodeComparator fits_into;
+  for (const auto& entry : nodes_) {
+    // entry.first is the pooled var node, entry.second its ops.
+    if (fits_into(var, entry.first)) {
+      return entry.first;
+    }
+  }
+  return nullptr;
+}
+
+void OrderedNodePairPool::Erase(ir::Node* var) {  // var must be pooled
+  PADDLE_ENFORCE(mark_table_.find(var->Name()) != mark_table_.end());
+  nodes_.erase(mark_table_.at(var->Name()));
+  mark_table_.erase(var->Name());
+}
+
+// Concatenates DebugString of every pooled var node, in pool order, each
+// followed by a single space.
+std::string OrderedNodePairPool::ToString() const {
+  std::stringstream joined;
+  for (const auto& entry : nodes_) joined << DebugString(entry.first) << " ";
+  return joined.str();
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_reuse_types.h b/paddle/fluid/framework/details/memory_reuse_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a9c1d948e869016717fea9ff6b8236adfc29845
--- /dev/null
+++ b/paddle/fluid/framework/details/memory_reuse_types.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <list>
+#include <string>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+constexpr char kFetchedVars[] = "fetched_vars";
+constexpr char kGraphNodePool[] = "graph_node_pool";
+
+// NOTE(dzh): each entry is a variable name plus the operators that use the
+// var; used by the early delete pass.
+// The analysis var pass is built on ir::Node, which may be released or
+// modified between passes, so OpDesc* is used to mark the ops instead.
+using GraphNodePool = std::vector<
+    std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
+
+// NOTE(dzh): nodes are kept in ascending order of their size in bytes.
+// In fluid, -1 means the batch size is determined at runtime; nodes whose
+// batch size is -1 always rank in front of the nodes with a fixed one.
+// For example,
+// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
+// Erase is O(1) on average; Insert scans the list for the insert position.
+class OrderedNodePairPool {
+ public:
+  using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
+  using Iter = typename std::list<NodePair>::iterator;
+  using ConstIter = typename std::list<NodePair>::const_iterator;
+
+  // Adds `op` to the users of `var`; a new `var` is inserted in order.
+  void Insert(ir::Node* var, ir::Node* op);
+  void Erase(ir::Node* var);  // `var` must be in the pool
+  // True if a node named like `var` is pooled. Const so const pools can ask.
+  bool Has(ir::Node* var) const { return mark_table_.count(var->Name()); }
+  // First pooled node that `var` is not larger than, or nullptr.
+  ir::Node* NodeMatch(ir::Node* var) const;
+  // Index of `var` in pool order (not const: map stores non-const iterators)
+  int GetIndex(ir::Node* var);
+  // Debug string of all pooled nodes.
+  std::string ToString() const;
+
+  Iter begin() { return nodes_.begin(); }
+  Iter end() { return nodes_.end(); }
+  ConstIter begin() const { return nodes_.begin(); }
+  ConstIter end() const { return nodes_.end(); }
+  size_t size() const { return nodes_.size(); }
+
+ private:
+  // var name -> position in nodes_, for searching.
+  std::unordered_map<std::string, Iter> mark_table_;
+  // Ordered pairs of a var node and the op nodes registered for it.
+  std::list<NodePair> nodes_;
+};
+
+// Memory size of the var node in bytes.
+size_t NodeSizeInBytes(ir::Node* n);
+// Name and shape of the var node, formatted as name[d0,d1,...].
+std::string DebugString(ir::Node* var);
+// VarDesc of the var node, looked up by name in the block of the op that is
+// the node's single input.
+VarDesc* FindVarDescInBlock(ir::Node* n);
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_reuse_types_test.cc b/paddle/fluid/framework/details/memory_reuse_types_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d2fabf5ce068e0f752b86c0d02b971f18fc65f01
--- /dev/null
+++ b/paddle/fluid/framework/details/memory_reuse_types_test.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+TEST(OrderedNodePairPool, Normal) {
+  OrderedNodePairPool pool;
+  std::vector<std::unique_ptr<ir::Node>> nodes;
+  // Var i of the block gets shapes[i]; -1 is a batch size set at runtime.
+  // clang-format off
+  std::vector<std::vector<int64_t>> shapes = {{-1, 10},
+                                              {-1, 20},
+                                              {1, 2},
+                                              {5, 2},
+                                              {10, 20},
+                                              {-1, 2, 5},
+                                              {-1, 1, 5},
+                                              {-1, 1}};
+  // clang-format on
+  const int COUNT = shapes.size();
+  ProgramDesc prog;
+  BlockDesc* block_desc = prog.MutableBlock(0);
+  auto* op_desc = block_desc->AppendOp();
+  op_desc->SetType("dummy");
+  std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
+  // FindVarDescInBlock wants exactly one input per var node: the dummy op.
+  for (int i = 0; i < COUNT; ++i) {
+    auto desc = block_desc->Var(std::to_string(i));
+    desc->SetShape(shapes[i]);
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+
+  for (auto& node : nodes) pool.Insert(node.get(), op.get());
+
+  // assert its order and interface.
+  std::cout << pool.ToString() << std::endl;
+  pool.Erase(nodes.front().get());
+  std::cout << pool.ToString() << std::endl;
+
+  ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
+  ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0);
+  // NodeMatch may return nullptr, so check it before calling GetIndex.
+  {
+    auto v1 = block_desc->Var("11");
+    v1->SetShape({-1, 256, 56, 56});
+    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
+    node1->inputs.emplace_back(op.get());
+    auto* cache = pool.NodeMatch(node1.get());
+    ASSERT_EQ(cache, nullptr);
+  }
+  {
+    auto v2 = block_desc->Var("12");
+    v2->SetShape({-1, 2, 5});
+    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
+    node1->inputs.emplace_back(op.get());
+    auto* cache = pool.NodeMatch(node1.get());
+    ASSERT_TRUE(cache != nullptr);
+    ASSERT_EQ(pool.GetIndex(cache), 2);  // match 6:[-1,2,5]
+  }
+  {
+    auto v3 = block_desc->Var("13");
+    v3->SetShape({2, 5});
+    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
+    node1->inputs.emplace_back(op.get());
+    auto* cache = pool.NodeMatch(node1.get());
+    ASSERT_TRUE(cache != nullptr);
+    ASSERT_EQ(pool.GetIndex(cache), 5);  // match  4:[5,2]
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
index 8f92f0948d7d397ab0f20c01eae9e313f739adec..c203073845375c879a0fc10564f5dad0f19ceae4 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
@@ -85,4 +85,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
 }  // namespace paddle
 
 REGISTER_PASS(multi_devices_print_pass,
-              paddle::framework::details::SSAGraghBuilderWithPrinter);
+              paddle::framework::details::SSAGraghBuilderWithPrinter)
+    .RequirePassAttr(paddle::framework::details::kGraphvizPath);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
index c00685fa1629c0722c315c726053c2cba8bf17e7..b06c87a5c185c550818af0bdeacd0070d1d90e4e 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <glog/logging.h>
 #include <fstream>
 #include <iosfwd>
 #include <ostream>
@@ -24,6 +25,8 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+constexpr char kGraphvizPath[] = "debug_graphviz_path";
+
 class SSAGraphPrinter {
  public:
   virtual ~SSAGraphPrinter() {}
@@ -40,7 +43,7 @@ class SSAGraghBuilderWithPrinter : public ir::Pass {
   std::unique_ptr<ir::Graph> ApplyImpl(
       std::unique_ptr<ir::Graph> graph) const override {
     std::unique_ptr<std::ostream> fout(
-        new std::ofstream(Get<const std::string>("debug_graphviz_path")));
+        new std::ofstream(Get<std::string>(kGraphvizPath)));
     PADDLE_ENFORCE(fout->good());
     Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
     return graph;
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index ba12ca3c61c05b3e856fffa8353d4ec5bf79bc39..b1a82e8771b92f2d0af4a1c7732ff2da54d496a8 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -25,7 +25,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
+constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@";
 
 // Wraps ir::Node and provide helper utilities.
 // It's responsible for populating necessary fields of ir::Node.
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 8c3912120b5b405eb14da8493aaae6e7d12537a0..da9556c6c1f3468208db02f2958ad6ad137c6566 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -100,7 +100,7 @@ static void DeleteUnusedTensors(
           continue;
         }
         auto* var = scope.FindVar(name);
-        if (var != nullptr) {
+        if (var == nullptr) {
           continue;
         }
 
diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc
index 8725ed46d595a2edb16a71ed5f23e706448e3ecd..2eb9e564f87807e88def536ee875ebe0d1e83cd6 100644
--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/executor_thread_worker.h"
+#include <algorithm>
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/message.h"
 #include "google/protobuf/text_format.h"
@@ -32,6 +33,89 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+#ifdef PADDLE_WITH_PSLIB
+int DensePullThread::start() {  // launches run() on _t; always returns 0
+  _running = true;
+  _t = std::thread([this] { run(); });
+  return 0;
+}
+
+// Thread body: while _running is set, pulls every dense table that
+// check_update_param reports as due, waits for those pulls, then sleeps.
+void DensePullThread::run() {
+  while (_running) {
+    _pull_dense_status.clear();
+    for (auto& table : _dense_variable_name) {
+      const auto& table_id = table.first;
+      if (!check_update_param(table_id)) continue;
+      _pull_dense_status.emplace_back(pull_dense(table_id));
+      reset_thread_version(table_id);
+    }
+    if (!_pull_dense_status.empty()) wait_all();
+
+    // Pause _sleep_time_ms milliseconds (usleep takes microseconds).
+    usleep(_sleep_time_ms * 1000);
+  }
+}
+// True when the minimum trainer version is >= _threshold past the last pull.
+bool DensePullThread::check_update_param(uint64_t table_id) {
+  {
+    std::lock_guard<std::mutex> lock(_mutex_for_version);
+    auto& version = _training_versions[table_id];
+    // Nothing registered for this table: min_element would return end().
+    if (version.empty()) return false;
+    _current_version[table_id] =
+        *(std::min_element(version.begin(), version.end()));
+  }
+  return _current_version[table_id] - _last_versions[table_id] >= _threshold;
+}
+
+void DensePullThread::reset_thread_version(uint64_t table_id) {
+  std::lock_guard<std::mutex> guard(_mutex_for_version);  // held to scope end
+  _last_versions[table_id] = _current_version[table_id];
+}
+// Points one region per variable of dense table `table_id` at the variable's
+// float data in _root_scope and returns the future of the client's pull.
+std::future<int32_t> DensePullThread::pull_dense(uint64_t table_id) {
+  auto& regions = _regions[table_id];
+  regions.clear();
+  auto& variables = _dense_variable_name[table_id];
+  regions.resize(variables.size());
+  for (auto i = 0u; i < variables.size(); ++i) {
+    Variable* var = _root_scope->FindVar(variables[i]);
+    // Abort with a message instead of dereferencing a missing variable.
+    if (var == nullptr) LOG(FATAL) << "no dense variable " << variables[i];
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    float* w = tensor->data<float>();
+    regions[i] = paddle::ps::Region(w, tensor->numel());
+  }
+  return _ps_client->pull_dense(regions.data(), regions.size(), table_id);
+}
+
+// Waits for every outstanding pull, counts the failed ones (aborting once
+// more than 20 have accumulated) and empties the status list.
+void DensePullThread::wait_all() {
+  for (auto& pending : _pull_dense_status) {
+    pending.wait();
+    if (pending.get() != 0) {
+      LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times;
+    }
+  }
+  if (_pull_dense_fail_times > 20) {
+    LOG(FATAL) << "pull dense failed times more than 20 times";
+    exit(-1);
+  }
+
+  _pull_dense_status.clear();
+}
+
+void DensePullThread::increase_thread_version(int thread_id,
+                                              uint64_t table_id) {
+  std::lock_guard<std::mutex> guard(_mutex_for_version);
+  ++_training_versions[table_id][thread_id];  // bump this thread's version
+}
+#endif
+
 void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) {
   auto& block = program.Block(0);
   op_names_.clear();
@@ -202,5 +286,358 @@ void ExecutorThreadWorker::SetRootScope(Scope* g_scope) {
   root_scope_ = g_scope;
 }
 
+#ifdef PADDLE_WITH_PSLIB
+//  AsyncExecutor
+void AsyncExecutorThreadWorker::TrainFiles() {
+  // Consume every batch from the reader, running one network pass per batch.
+  SetDevice();
+
+  const int fetch_var_num = fetch_var_names_.size();
+  fetch_values_.clear();
+  fetch_values_.resize(fetch_var_num);
+
+  thread_reader_->Start();
+
+  int cur_batch;
+  int batch_cnt = 0;
+  while ((cur_batch = thread_reader_->Next()) > 0) {
+    // executor run here
+    TrainOneNetwork();
+
+    ++batch_cnt;
+    thread_scope_->DropKids();
+
+    // Fetch variables are printed only by thread 0, and only in debug mode.
+    if (debug_ && thread_id_ == 0) {
+      for (int idx = 0; idx < fetch_var_num; ++idx) {
+        print_fetch_var(thread_scope_, fetch_var_names_[idx]);
+      }
+    }
+  }
+}
+
+void AsyncExecutorThreadWorker::SetPSlibPtr(
+    std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {
+  _pslib_ptr = pslib_ptr;  // shares ownership of the PSlib handle
+}
+void AsyncExecutorThreadWorker::SetPullDenseThread(
+    std::shared_ptr<DensePullThread> dpt) {
+  _pull_dense_thread = dpt;  // used by UpdateParams() to bump versions
+}
+void AsyncExecutorThreadWorker::TrainOneNetwork() {
+  // Prepare parameters, run the local ops, then push the updates.
+  PrepareParams();
+  for (auto& op : ops_) {
+    // Ops whose type contains "sgd" or any configured skip_op are not run.
+    bool runnable = op->Type().find("sgd") == std::string::npos;
+    if (runnable) {
+      for (const auto& pattern : _param_config->skip_op) {
+        if (op->Type().find(pattern) != std::string::npos) {
+          runnable = false;
+          break;
+        }
+      }
+    }
+    if (runnable) {
+      op->Run(*thread_scope_, place_);
+    }
+  }
+  UpdateParams();
+}
+
+void AsyncExecutorThreadWorker::SetParamConfig(
+    AsyncWorkerParamConfig* param_config) {
+  _param_config = param_config;  // raw pointer; presumably owned by caller
+}
+
+void AsyncExecutorThreadWorker::PrepareParams() {
+  // Pull each sparse table, wait for it, then fill the embedding inputs.
+  for (auto table_id : _param_config->sparse_table_id) {
+    PullSparse(table_id);
+    for (auto& t : _pull_sparse_status) {
+      auto status = t.get();  // blocks until the pull has finished
+      if (status != 0) {
+        LOG(ERROR) << "pull sparse failed, status[" << status << "]";
+        exit(-1);
+      }
+    }
+    // Drop consumed futures now: get() must not run twice on one future.
+    _pull_sparse_status.resize(0);
+  }
+  for (auto table_id : _param_config->sparse_table_id) {
+    FillSparse(table_id);
+  }
+}
+
+void AsyncExecutorThreadWorker::UpdateParams() {
+  for (auto i : _param_config->sparse_table_id) {
+    PushSparse(i);
+  }
+  for (auto i : _param_config->dense_table_id) {
+    PushDense(i);
+  }
+  int32_t tmp_push_dense_wait_times = -1;   // fixed locally, not from config
+  int32_t tmp_push_sparse_wait_times = -1;  // fixed locally, not from config
+  static uint32_t push_dense_wait_times =
+      static_cast<uint32_t>(tmp_push_dense_wait_times);  // -1 -> UINT32_MAX
+  static uint32_t push_sparse_wait_times =
+      static_cast<uint32_t>(tmp_push_sparse_wait_times);  // -1 -> UINT32_MAX
+  // NOTE(review): with UINT32_MAX thresholds the waits practically never run.
+  if (_push_dense_status.size() >= push_dense_wait_times) {
+    for (auto& t : _push_dense_status) {
+      t.wait();
+    }
+    _push_dense_status.resize(0);
+  }
+  if (tmp_push_dense_wait_times == -1) {  // always true as written
+    _push_dense_status.resize(0);         // futures dropped without waiting
+  }
+  if (_push_sparse_status.size() >= push_sparse_wait_times) {
+    for (auto& t : _push_sparse_status) {
+      t.wait();
+    }
+    _push_sparse_status.resize(0);
+  }
+  if (tmp_push_sparse_wait_times == -1) {  // always true as written
+    _push_sparse_status.resize(0);         // futures dropped without waiting
+  }
+  for (auto dense_table_id : _param_config->dense_table_id) {
+    _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id);
+  }
+}
+
+void AsyncExecutorThreadWorker::PushDense(int table_id) {
+  // Wrap each dense gradient tensor of the table in a Region and push them.
+  std::vector<paddle::ps::Region> regions;
+  for (auto& name : _param_config->dense_gradient_variable_name[table_id]) {
+    Variable* var = thread_scope_->FindVar(name);
+    CHECK(var != nullptr) << "var[" << name << "] not found";
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int count = tensor->numel();
+    float* g = tensor->data<float>();
+    regions.emplace_back(paddle::ps::Region(g, count));
+  }
+
+  // The returned future is queued in _push_dense_status.
+  _push_dense_status.push_back(_pslib_ptr->_worker_ptr->push_dense(
+      regions.data(), regions.size(), table_id));
+}
+
+void AsyncExecutorThreadWorker::PullSparse(int table_id) {
+  // Gather the batch's non-zero feasigns and start an async pull for them.
+  auto& features = _features[table_id];
+  auto& feature_value = _feature_value[table_id];
+  auto fea_dim = _param_config->fea_dim;
+  // slot id starts from 1
+  features.clear();
+  features.reserve(MAX_FEASIGN_NUM);
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
+  // slot_idx = 0 is label TODO
+  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
+    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
+    CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found";
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int64_t* ids = tensor->data<int64_t>();
+    int len = tensor->numel();
+    for (int i = 0; i < len; ++i) {
+      // todo(colourful-tree): current trick - filter feasign=use_slot_mod(
+      // bug: datafeed fill use_slot_mod for empty slot)
+      if (ids[i] == 0) continue;
+      features.push_back(static_cast<uint64_t>(ids[i]));
+    }
+  }
+  check_pull_push_memory(features, &feature_value, fea_dim);
+
+  // NOTE(review): local pointer array; confirm pull_sparse copies it.
+  std::vector<float*> pull_feature_value;
+  for (auto i = 0u; i < features.size(); ++i) {
+    pull_feature_value.push_back(feature_value[i].data());
+  }
+
+  auto status = _pslib_ptr->_worker_ptr->pull_sparse(
+      pull_feature_value.data(), table_id, features.data(), features.size());
+  _pull_sparse_status.push_back(std::move(status));
+
+  auto& push_g = _feature_push_value[table_id];
+  check_pull_push_memory(features, &push_g, fea_dim);
+
+  collect_feasign_info(table_id);
+}
+
+void AsyncExecutorThreadWorker::FillSparse(int table_id) {
+  // Scatter the pulled feature values into each slot's embedding tensor.
+  auto slot_dim = _param_config->slot_dim;
+  auto fea_dim = _param_config->fea_dim;
+  auto& features = _features[table_id];
+  auto& fea_value = _feature_value[table_id];
+
+  CHECK(features.size() > 0) << "feature size check failed";
+  // Each pulled row is read as 2 leading floats followed by slot_dim floats.
+  CHECK(fea_dim >= slot_dim + 2)
+      << "fea_dim:" << fea_dim << " slot_dim:" << slot_dim;
+
+  auto fea_idx = 0u;
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
+  // slot_idx = 0 is label TODO
+  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
+    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
+    CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found";
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int64_t* ids = tensor->data<int64_t>();
+    int len = tensor->numel();
+    Variable* var_emb = thread_scope_->FindVar(
+        _param_config->slot_input_vec[table_id][slot_idx - 1]);
+    CHECK(var_emb != nullptr) << "slot input var not found";
+    LoDTensor* tensor_emb = var_emb->GetMutable<LoDTensor>();
+    float* ptr =
+        tensor_emb->mutable_data<float>({len, slot_dim}, platform::CPUPlace());
+    // Zero-fill first: rows whose id is 0 keep an all-zero embedding.
+    memset(ptr, 0, sizeof(float) * len * slot_dim);
+    auto& tensor_lod = tensor->lod()[0];
+    LoD data_lod{tensor_lod};
+    tensor_emb->set_lod(data_lod);
+
+    for (int index = 0; index < len; ++index) {
+      if (ids[index] == 0) continue;
+      CHECK(fea_idx < fea_value.size()) << "fea_idx:" << fea_idx;
+      memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2,
+             sizeof(float) * slot_dim);
+      fea_idx++;
+    }
+  }
+}
+
+void AsyncExecutorThreadWorker::PushSparse(int table_id) {
+  auto slot_dim = _param_config->slot_dim;  // gradient floats per feasign
+  auto fea_dim = _param_config->fea_dim;
+  auto& features = _features[table_id];  // feasigns gathered by PullSparse()
+  auto& push_g = _feature_push_value[table_id];
+  check_pull_push_memory(features, &push_g, fea_dim);
+  CHECK(push_g.size() == features.size() + 1)
+      << "push_g size:" << push_g.size()
+      << " features size:" << features.size();
+  uint64_t fea_idx = 0u;
+  auto& fea_info = _fea_info[table_id];
+  int offset = 2;  // gradient starts after floats [0] and [1] of each row
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
+  // slot_idx = 0 is label
+  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
+    if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) ==
+        _param_config->slot_alias_to_table.end()) {
+      LOG(ERROR) << "ERROR slot_idx:" << slot_idx
+                 << " name:" << feed_vec[slot_idx];  // NOTE: falls through
+    } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] !=
+               table_id) {
+      continue;
+    }
+    Variable* g_var = thread_scope_->FindVar(
+        _param_config->gradient_var[table_id][slot_idx - 1]);
+    CHECK(g_var != nullptr)
+        << "var[" << _param_config->gradient_var[table_id][slot_idx - 1]
+        << "] not found";
+    LoDTensor* g_tensor = g_var->GetMutable<LoDTensor>();
+    if (g_tensor == NULL) {
+      LOG(ERROR) << "var["
+                 << _param_config->gradient_var[table_id][slot_idx - 1]
+                 << "] not found";
+      exit(-1);
+    }
+    float* g = g_tensor->data<float>();
+
+    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
+    CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found";
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    if (tensor == NULL) {
+      LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found";
+      exit(-1);
+    }
+    int len = tensor->numel();
+    CHECK(slot_dim * len == g_tensor->numel())
+        << "len:" << len << " g_numel:" << g_tensor->numel();
+    CHECK(len == tensor->numel()) << "len:" << len
+                                  << "t_numel:" << tensor->numel();
+    int64_t* ids = tensor->data<int64_t>();
+    for (auto id_idx = 0u; id_idx < len; ++id_idx) {
+      if (ids[id_idx] == 0) {
+        g += slot_dim;  // zero ids have no row in push_g; skip their gradient
+        continue;
+      }
+      memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim);
+      push_g[fea_idx][0] = 1.0f;
+      CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx
+                                       << " size:" << fea_info.size();
+      push_g[fea_idx][1] = static_cast<float>(fea_info[fea_idx].label);
+      g += slot_dim;
+      fea_idx++;
+    }
+  }
+  CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx
+                                    << " features size:" << features.size();
+  CHECK_GT(features.size(), 0);
+  // Hand the client one row pointer per feature.
+  std::vector<float*> push_g_vec;
+  for (auto i = 0u; i < features.size(); ++i) {
+    push_g_vec.push_back(push_g[i].data());
+  }
+  auto status = _pslib_ptr->_worker_ptr->push_sparse(
+      table_id, features.data(), (const float**)push_g_vec.data(),
+      features.size());
+  _push_sparse_status.push_back(std::move(status));
+}
+
+void AsyncExecutorThreadWorker::collect_feasign_info(int table_id) {
+  // Record, for every non-zero feasign, its slot, instance and label.
+  auto& fea_info = _fea_info[table_id];
+  auto& feature = _features[table_id];
+  fea_info.resize(feature.size());
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
+  Variable* var = thread_scope_->FindVar(feed_vec[0]);
+  CHECK(var != nullptr) << "var[" << feed_vec[0] << "] not found";
+  int64_t* label = var->GetMutable<LoDTensor>()->data<int64_t>();
+
+  size_t global_index = 0;
+  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
+    Variable* in_var = thread_scope_->FindVar(feed_vec[slot_idx]);
+    CHECK(in_var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found";
+    LoDTensor* tensor = in_var->GetMutable<LoDTensor>();
+    int64_t* ids = tensor->data<int64_t>();
+    // lod()[0][k] is used as the end offset of instance k (k starts at 1).
+    size_t fea_idx = 0;
+    for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) {
+      for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) {
+        if (ids[fea_idx] == 0) continue;
+        // Guard the write; the final size CHECK alone would come too late.
+        CHECK(global_index < fea_info.size()) << "too many feasigns";
+        fea_info[global_index++] = {slot_idx, ins_idx, label[ins_idx - 1]};
+      }
+    }
+  }
+  CHECK(global_index == feature.size())
+      << "expect fea info size:" << feature.size() << " real:" << global_index;
+}
+
+void AsyncExecutorThreadWorker::check_pull_push_memory(
+    const std::vector<uint64_t>& features,
+    std::vector<std::vector<float>>* push_g, int dim) {
+  push_g->resize(features.size() + 1);  // one spare row beyond the features
+  for (size_t row = 0; row < push_g->size(); ++row) {
+    (*push_g)[row].resize(dim);
+  }
+}
+
+void AsyncExecutorThreadWorker::check_pull_push_memory(
+    const std::vector<uint64_t>& features, std::vector<float*>* push_g,
+    int dim) {
+  if (features.size() > push_g->size()) {  // grows only; never shrinks
+    push_g->reserve(features.size() + 1);
+    auto size = features.size() - push_g->size() + 1;
+    for (auto i = 0u; i < size; ++i) {
+      float* ptr = new float[dim];  // NOTE(review): no matching delete[] here
+      push_g->push_back(ptr);
+    }
+  }
+}
+#endif
+
 }  // einit_modelnd namespace framework
 }  // end namespace paddle
diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h
index 13ec2442c46459116320236bf98f23c91340f389..30b81ad88035eacc7a8efbe6d20f03d362122003 100644
--- a/paddle/fluid/framework/executor_thread_worker.h
+++ b/paddle/fluid/framework/executor_thread_worker.h
@@ -25,16 +25,119 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_PSLIB
+#include <pslib.h>
+#endif
 
 namespace paddle {
 namespace framework {
+
 void CreateTensor(Variable* var, proto::VarType::Type var_type);
+#ifdef PADDLE_WITH_PSLIB
+static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100;
+
+struct AsyncWorkerParamConfig {
+  int slot_dim;  // floats copied per feasign to/from the slot tensors
+  int fea_dim;   // length of each per-feasign pull/push buffer
+  int32_t tmp_push_dense_wait_times;
+  int32_t tmp_push_sparse_wait_times;
+  // Substrings of op types that TrainOneNetwork() does not run.
+  std::vector<std::string> skip_op;
+  // Variable names per table id.
+  std::map<uint64_t, std::vector<std::string>> dense_variable_name;
+  std::map<uint64_t, std::vector<std::string>> dense_gradient_variable_name;
+  std::vector<int> dense_table_id;
+  // fea_dim for each dense table
+  std::vector<uint32_t> dense_table_size;
+  std::vector<int> sparse_table_id;
+  std::map<uint64_t, std::vector<std::string>> slot_input_vec;
+  std::map<uint64_t, std::vector<std::string>> gradient_var;
+  std::map<std::string, uint64_t> slot_alias_to_table;  // slot alias -> table
+};
+
+struct DensePullThreadParam {
+  std::shared_ptr<paddle::ps::PSClient> ps_client;
+  int threshold;  // min version advance before check_update_param() is true
+  int training_thread_num;  // number of per-table version counters
+  Scope* root_scope;  // scope searched for the dense variables
+  std::map<uint64_t, std::vector<std::string>>* dense_params;  // by table id
+  int sleep_time_ms = 2;
+};
+
+// Tracks per-trainer versions of each dense table and owns worker thread _t.
+class DensePullThread {
+ public:
+  explicit DensePullThread(const DensePullThreadParam& param)
+      : _running(false) {
+    _ps_client = param.ps_client;
+    _threshold = param.threshold;
+    _thread_num = param.training_thread_num;
+    _root_scope = param.root_scope;
+    _sleep_time_ms = param.sleep_time_ms;
+
+    for (auto& t : *param.dense_params) {
+      _dense_variable_name[t.first].insert(_dense_variable_name[t.first].end(),
+                                           t.second.begin(), t.second.end());
+      _training_versions[t.first].resize(_thread_num, 0);
+      _last_versions[t.first] = 0;
+      _current_version[t.first] = 0;
+    }
+  }
+  // Joins the worker: destroying a joinable std::thread calls std::terminate.
+  ~DensePullThread() { stop(); }
+
+  int start();
+
+  void stop() {
+    if (_running) {
+      _running = false;
+      _t.join();
+    }
+  }
+
+  void increase_thread_version(int thread_id, uint64_t table_id);
+  void reset_thread_version(uint64_t table_id);
+  std::future<int32_t> pull_dense(uint64_t table_id);
+  void pull_dense2(uint64_t table_id);
+  void wait_all();
+
+ private:
+  void run();
+  bool check_update_param(uint64_t table_id);
+
+  std::shared_ptr<paddle::ps::PSClient> _ps_client;
+  int _thread_num;
+  int _threshold;
+  int _sleep_time_ms;
+  Scope* _root_scope;
+  bool _running;  // NOTE(review): plain bool; confirm run() reads it safely
+
+  std::map<uint64_t, uint64_t> _last_versions;
+  std::map<uint64_t, uint64_t> _current_version;
+  std::mutex _mutex_for_version;
+  std::map<uint64_t, std::vector<uint64_t>> _training_versions;
+  std::map<uint64_t, std::vector<std::string>> _dense_variable_name;
+
+  std::thread _t;
+  std::vector<::std::future<int32_t>> _pull_dense_status;
+
+  std::map<uint64_t, std::vector<paddle::ps::Region>> _regions;
+  uint32_t _pull_dense_fail_times = 0;
+
+  std::vector<float> _base_norm_param;
+  std::vector<float> _mean;
+  std::vector<float> _scale;
+  float _squared_sum_epsilon = 1e-4;
+  std::mutex _mutex_for_mean_scale;
+  float _total_batch_num = 0;
+};
+#endif
 
 class ExecutorThreadWorker {
  public:
   ExecutorThreadWorker()
       : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
-  ~ExecutorThreadWorker() {}
+  virtual ~ExecutorThreadWorker() {}
 
   void CreateThreadResource(const framework::ProgramDesc& program,
                             const paddle::platform::Place& place);
@@ -51,9 +154,15 @@ class ExecutorThreadWorker {
   // set data feed declared in executor
   void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
   // A multi-thread training function
-  void TrainFiles();
+  virtual void TrainFiles();
   // set fetch variable names from python interface assigned by users
   void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
+#ifdef PADDLE_WITH_PSLIB
+  virtual void SetPSlibPtr(
+      std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {}
+  virtual void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt) {}
+  virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}
+#endif
 
  private:
   void CreateThreadScope(const framework::ProgramDesc& program);
@@ -77,12 +186,58 @@ class ExecutorThreadWorker {
   Scope* root_scope_;
   // a thread scope, father scope is global score which is shared
   Scope* thread_scope_;
-
- private:
   std::vector<std::string> fetch_var_names_;
   std::vector<std::vector<float>> fetch_values_;
   bool debug_;
 };
 
+#ifdef PADDLE_WITH_PSLIB
+// Thread worker that pulls/pushes sparse and dense values through PSlib.
+class AsyncExecutorThreadWorker : public ExecutorThreadWorker {
+ public:
+  AsyncExecutorThreadWorker() {}
+  virtual ~AsyncExecutorThreadWorker() {}
+  void SetPSlibPtr(
+      std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) override;
+  void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt) override;
+  void SetParamConfig(AsyncWorkerParamConfig* param_config) override;
+  void TrainFiles() override;
+  void TrainOneNetwork();
+  void PrepareParams();
+  void UpdateParams();
+  void PullSparse(int table_id);
+  void FillSparse(int table_id);
+  void PushSparse(int table_id);
+  void PushDense(int table_id);
+
+  void check_pull_push_memory(const std::vector<uint64_t>& features,
+                              std::vector<float*>* push_g, int dim);
+  void check_pull_push_memory(const std::vector<uint64_t>& features,
+                              std::vector<std::vector<float>>* push_g, int dim);
+  void collect_feasign_info(int table_id);
+
+ private:
+  // Where a feasign came from: slot index, instance index and its label.
+  struct FeasignInfo {
+    uint32_t slot;
+    uint32_t ins;
+    int64_t label;
+  };
+  // Per-table state; each map below is keyed by table id.
+  std::map<uint64_t, std::vector<uint64_t>> _features;
+  std::map<uint64_t, std::vector<FeasignInfo>> _fea_info;
+  std::map<uint64_t, std::vector<std::vector<float>>> _feature_value;
+  std::map<uint64_t, std::vector<std::vector<float>>> _feature_push_value;
+  std::shared_ptr<paddle::distributed::PSlib> _pslib_ptr;
+  std::shared_ptr<DensePullThread> _pull_dense_thread;
+
+  std::vector<::std::future<int32_t>> _pull_sparse_status;
+  std::vector<::std::future<int32_t>> _pull_dense_status;
+  std::vector<::std::future<int32_t>> _push_sparse_status;
+  std::vector<::std::future<int32_t>> _push_dense_status;
+  AsyncWorkerParamConfig* _param_config = nullptr;  // set by SetParamConfig()
+};
+#endif
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index be4151b54b6087c10989c093bf007ccf3006bd65..b7f7e2ee8ef590c0d0d8307de4400a8ce8ad4e7d 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -44,6 +44,7 @@ pass_library(seqconv_eltadd_relu_fuse_pass inference)
 pass_library(is_test_pass base)
 pass_library(conv_elementwise_add_act_fuse_pass inference)
 pass_library(conv_elementwise_add2_act_fuse_pass inference)
+pass_library(conv_elementwise_add_fuse_pass inference)
 if(WITH_MKLDNN)
     pass_library(mkldnn_placement_pass base)
     pass_library(depthwise_conv_mkldnn_pass base)
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..476c9dbc353f865916d0065bbce653d7b7204dce
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
+#define GET_NODES                    \
+  GET_IR_NODE(conv_op);              \
+  GET_IR_NODE(conv_out);             \
+  GET_IR_NODE(conv_filter);          \
+  GET_IR_NODE(elementwise_add_op);   \
+  GET_IR_NODE(elementwise_add_in_y); \
+  GET_IR_NODE(elementwise_add_out);
+
+std::unique_ptr<ir::Graph> ConvElementwiseAddFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  // Rewrites each conv2d -> elementwise_add chain into one conv2d_fusion op.
+  const std::string pattern_name = "conv_elementwise_add_fuse";
+  FusePassBase::Init(pattern_name, graph.get());
+
+  GraphPatternDetector detector;
+  // Anchor node: any variable consumed as the "Input" of a conv2d.
+  auto* conv_input = detector.mutable_pattern()
+                         ->NewNode("x")
+                         ->assert_is_op_input("conv2d", "Input")
+                         ->AsInput();
+
+  patterns::ConvElementwiseadd pattern(detector.mutable_pattern(),
+                                       pattern_name);
+  pattern(conv_input);
+
+  auto fuse = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                  Graph* g) {
+    GET_NODES;
+
+    // Start from the conv2d description, then retarget it to the fused op.
+    auto conv_proto = *conv_op->Op()->Proto();
+    framework::OpDesc fused_desc(conv_proto, nullptr);
+    fused_desc.SetType("conv2d_fusion");
+    fused_desc.SetInput("Bias", {elementwise_add_in_y->Name()});
+    fused_desc.SetInput("ResidualData", {});
+    fused_desc.SetAttr("activation", std::string("identity"));
+    fused_desc.SetOutput("Output", {elementwise_add_out->Name()});
+    fused_desc.SetAttr("is_test", true);
+    fused_desc.SetAttr("use_cudnn", false);
+    fused_desc.Flush();
+
+    // Create a new node for the fused op.
+    auto* fused_op = graph->CreateOpNode(&fused_desc);
+
+    // Wire the fused op between the matched inputs and the add's output.
+    PADDLE_ENFORCE(subgraph.count(conv_input));
+    auto* conv_in_node = subgraph.at(conv_input);
+
+    IR_NODE_LINK_TO(conv_in_node, fused_op);          // Input
+    IR_NODE_LINK_TO(conv_filter, fused_op);           // Filter
+    IR_NODE_LINK_TO(elementwise_add_in_y, fused_op);  // Bias
+    IR_NODE_LINK_TO(fused_op, elementwise_add_out);   // Output
+
+    // The original conv, its output var and the add op are now redundant.
+    GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op});
+  };
+
+  detector(graph.get(), fuse);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_elementwise_add_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAddFusePass);
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..f234603f5856a9238164f7fb0e5cc81ea9b7ed60
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class ConvElementwiseAddFusePass : public FusePassBase {
+ public:
+  virtual ~ConvElementwiseAddFusePass() {}
+  // ApplyImpl (in the .cc) fuses conv2d + elementwise_add into conv2d_fusion.
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 8679118fe28b1c68aea30caf711441823b5255c0..8670dcfed7e40473e06cd12cecc1157dd4f54aa0 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -162,7 +162,10 @@ void Graph::ResolveHazard(
           (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
       const auto &read_ops = (*it_old)->outputs;
 
-      PADDLE_ENFORCE(write_op, "The write_op should not be empty.");
+      PADDLE_ENFORCE(
+          write_op,
+          string::Sprintf("The write_op of var %s should not be empty.",
+                          (*it_new)->Name()));
 
       // Add write after write dependence
       ir::Node *upstream_op =
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index d2d28793c4320e3664bb69c65dab4fec830e4d02..d99f856d8f46ea760ce07533446ce3bec95d7d27 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <fstream>
 #include <iosfwd>
 #include <ostream>
+#include <unordered_map>
 #include <unordered_set>
 
 DEFINE_string(print_sub_graph_dir, "",
@@ -121,7 +122,7 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
 }
 
 size_t GraphNum(const Graph &graph) {
-  std::unordered_set<ir::Node *> nodes = graph.Nodes();
+  std::unordered_set<ir::Node *> nodes(graph.Nodes());
   std::unordered_set<ir::Node *> visited_nodes;
   visited_nodes.reserve(nodes.size());
   std::deque<ir::Node *> q_nodes;
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index 8d92c406689ab3a97596a8666ceb452aec4be170..be525151f9f9749b913a7e5111e5622d868bd266 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -24,6 +24,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+
 // Test if the graph contains circle.
 bool HasCircle(const Graph &graph);
 
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index bf12d12459c59304167d9c52059a068b50de3980..13d752e5167c039ec8d9e4300b190a726bb02a63 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -17,7 +17,6 @@
 #include <string>
 #include <vector>
 
-#include "graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
@@ -1210,6 +1209,33 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
   return act_out;
 }
 
+PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) {
+  // Matches a conv2d whose output feeds input X of an elementwise_add.
+  conv_in->AsInput();
+  auto *conv = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+  auto *conv_result = pattern->NewNode(conv_out_repr())
+                          ->assert_is_op_output("conv2d")
+                          ->assert_is_op_input("elementwise_add", "X")
+                          ->AsIntermediate();
+  auto *filter = pattern->NewNode(conv_filter_repr())
+                     ->assert_is_op_input("conv2d", "Filter")
+                     ->AsInput();
+  auto *add = pattern->NewNode(elementwise_add_op_repr())
+                  ->assert_is_op("elementwise_add");
+  auto *add_y = pattern->NewNode(elementwise_add_in_y_repr())
+                    ->assert_is_op_input("elementwise_add", "Y")
+                    ->AsInput();
+  auto *add_out = pattern->NewNode(elementwise_add_out_repr())
+                      ->assert_is_op_output("elementwise_add")
+                      ->AsOutput();
+
+  conv->LinksFrom({conv_in, filter});
+  conv_result->LinksFrom({conv});
+  add->LinksFrom({conv_result, add_y}).LinksTo({add_out});
+
+  return add_out;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 0fee2f1c1852b296b3599d4b7219a032062a1d49..eaedd9d08e0fab820481d6eaacb6e7bfc1ab6d1d 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -716,6 +716,24 @@ struct ConvElementwiseadd2Act : public PatternBase {
   PATTERN_DECL_NODE(act_out);
 };
 
+// Conv + ElementwiseAdd
+// This pattern should be used after the ConvElementwiseadd2Act or
+// ConvElementwiseaddAct pass
+struct ConvElementwiseadd : public PatternBase {
+  ConvElementwiseadd(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_elementwiseadd") {}
+  // Builds the pattern from conv_in; returns the elementwise_add output node.
+  PDNode* operator()(PDNode* conv_in);
+  // Nodes around the conv2d op.
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_out);
+  PATTERN_DECL_NODE(conv_filter);
+  // Nodes around the elementwise_add op.
+  PATTERN_DECL_NODE(elementwise_add_op);
+  PATTERN_DECL_NODE(elementwise_add_in_y);
+  PATTERN_DECL_NODE(elementwise_add_out);
+};
+
 }  // namespace patterns
 
 // Link two ir::Nodes from each other.
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index eac67108e2106e986cbe1255a64c956153bc5560..45d81b937392244f678fbd01395b3ffffd07f710 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -30,6 +30,14 @@ std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
   return std::unique_ptr<Node>(new Node(name, type));
 }
 
+std::unique_ptr<Node> CreateNodeForTest(VarDesc *var_desc) {
+  return std::unique_ptr<Node>(new Node(var_desc));  // caller owns the Node
+}
+
+std::unique_ptr<Node> CreateNodeForTest(OpDesc *op_desc) {
+  return std::unique_ptr<Node>(new Node(op_desc));  // caller owns the Node
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index d2a393b3f19e9aab79098757dae663d030b0fa2b..89dcc677b57eba356c0b6af857f9f8ff6273a683 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <typeindex>
 #include <typeinfo>
 #include <vector>
-
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/macros.h"
@@ -125,6 +124,8 @@ class Node {
   friend class Graph;
   friend std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
                                                  Node::Type type);
+  friend std::unique_ptr<Node> CreateNodeForTest(VarDesc* var_desc);
+  friend std::unique_ptr<Node> CreateNodeForTest(OpDesc* op_desc);
 
   explicit Node(const std::string& name, Type type)
       : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}
@@ -152,7 +153,9 @@ class Node {
 
 std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
                                         Node::Type type);
+std::unique_ptr<Node> CreateNodeForTest(VarDesc* var_desc);
 
+std::unique_ptr<Node> CreateNodeForTest(OpDesc* op_desc);
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index eb4baa06b5284512eab128e57f893bad43afda97..7e3fe02eaf5560ef03e42c6b82ed338edc30b0ab 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/parallel_executor.h"
+#include <algorithm>
 #include <string>
 #include <tuple>
 #include <vector>
@@ -93,6 +94,7 @@ class ParallelExecutorPrivate {
     }
   }
 
+  BuildStrategy build_strategy_;
   std::vector<platform::Place> places_;
   std::vector<Scope *> local_scopes_;
   Scope *global_scope_;  // not owned
@@ -169,6 +171,14 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
     eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
     graph = eager_deletion_pass->Apply(std::move(graph));
     VLOG(10) << "EagerDeletionPass Applied";
+
+    if (build_strategy_.memory_early_delete_) {
+      auto early_delete_pass =
+          ir::PassRegistry::Instance().Get("memory_early_delete_pass");
+      early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
+      graph = early_delete_pass->Apply(std::move(graph));
+      VLOG(10) << "MemoryEarlyDeletePass Applied.";  // only when it ran
+    }
   }
 
   return graph;
@@ -189,6 +199,7 @@ ParallelExecutor::ParallelExecutor(
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
+  member_->build_strategy_ = build_strategy;
   member_->use_all_reduce_ =
       build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
 
@@ -245,7 +256,6 @@ ParallelExecutor::ParallelExecutor(
       build_strategy.Apply(main_program, member_->places_, loss_var_name,
                            params, member_->local_scopes_, member_->use_cuda_);
 #endif
-
   auto max_memory_size = GetEagerDeletionThreshold();
   if (max_memory_size >= 0) {
     graph = member_->PrepareGCAndRefCnts(std::move(graph),
@@ -280,10 +290,12 @@ ParallelExecutor::ParallelExecutor(
 
   if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
     member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+        exec_strategy, member_->local_scopes_, member_->places_,
+        std::move(graph)));
   } else {
     member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+        exec_strategy, member_->local_scopes_, member_->places_,
+        std::move(graph)));
   }
 
   member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
@@ -423,5 +435,6 @@ ParallelExecutor::~ParallelExecutor() {
 }  // namespace framework
 }  // namespace paddle
 
+USE_PASS(memory_early_delete_pass);
 USE_PASS(reference_count_pass);
 USE_PASS(eager_deletion_pass);
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index a0a9a573603ceb6b577529101cb331adbc81337a..83dea8639010f77619a6fc2a81e092ae513c6e79 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -74,6 +74,22 @@ TEST(Tensor, MutableData) {
     p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
                                         platform::CPUPlace());
     EXPECT_EQ(p1, p2);
+
+    float* p3 = nullptr;
+    float* p4 = nullptr;
+    // set src_tensor a different type but smaller size.
+    // memory block is supposed to be unchanged.
+    auto* tmp = src_tensor.mutable_data<uint8_t>(framework::make_ddim({2, 2}),
+                                                 platform::CPUPlace());
+    p3 = reinterpret_cast<float*>(tmp);
+    EXPECT_EQ(p1, p3);
+
+    // set src_tensor a different type but bigger size.
+    // memory block is supposed to be changed.
+    auto* tmp2 = src_tensor.mutable_data<double>(
+        framework::make_ddim({2, 2, 3}), platform::CPUPlace());
+    p4 = reinterpret_cast<float*>(tmp2);
+    EXPECT_NE(p1, p4);
   }
   // Not sure if it's desired, but currently, Tensor type can be changed.
   {
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 058a5b5f460d2bd3c4c0248929dd0c87f7506930..b80e7ef752c5251e3ea3f9d9c11f6a2b1422cd34 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -26,9 +26,6 @@ endif(WIN32)
 # paddle_fluid_origin exclude inference api interface
 if(WIN32)
   sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
-  if(WITH_GPU AND NOT WITH_DSO)
-    target_link_libraries(paddle_fluid_origin ${cuda_modules})
-  endif(WITH_GPU AND NOT WITH_DSO)
 else(WIN32)
   cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
 endif(WIN32)
@@ -44,9 +41,6 @@ set(SHARED_INFERENCE_SRCS
 if(WIN32)
   sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
     analysis_config paddle_pass_builder)
-  if(WITH_GPU AND NOT WITH_DSO)
-    target_link_libraries(paddle_fluid ${cuda_modules})
-  endif(WITH_GPU AND NOT WITH_DSO)
 else(WIN32)
   cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
     analysis_config paddle_pass_builder)
@@ -63,9 +57,6 @@ if(WIN32)
   sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
           DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
   target_link_libraries(paddle_fluid_shared shlwapi)
-  if(WITH_GPU AND NOT WITH_DSO)
-    target_link_libraries(paddle_fluid_origin ${cuda_modules})
-  endif(WITH_GPU AND NOT WITH_DSO)
 else(WIN32)
   cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
       DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 4ffe5f575c232ccfc0089cb86e28737e56b32f94..9c42b83e7add348433635b1899087324e4e370d4 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -63,7 +63,6 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
 void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
                                             Graph *graph) const {
   auto *op_desc = node->Op();
-  static int counter{0};
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());
 
@@ -192,8 +191,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
           block_desc.Proto()->SerializeAsString());
   SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
   SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
-  SetAttr(op_desc->Proto(), "engine_uniq_key",
-          "trt-" + std::to_string(counter++));
   SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
 }
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index ec93729cd2b379dc2ac39b51df6799b74c8529b6..8d0d96d391efd7f0f11e9d48f5a6221431bd3824 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -15,12 +15,43 @@ macro(safe_set_static_flag)
     endforeach(flag_var)
 endmacro()
 
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+endif()
+if(NOT DEFINED DEMO_NAME)
+  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
+endif()
+
+include_directories("${PADDLE_LIB}/")
+include_directories("${PADDLE_LIB}/fluid_inference_install_dir/")
+include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
+include_directories("${PADDLE_LIB}/third_party/install/glog/include")
+include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
+include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+include_directories("${PADDLE_LIB}/third_party/boost")
+include_directories("${PADDLE_LIB}/third_party/eigen3")
+
+link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
+link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
+link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
+link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
+link_directories("${PADDLE_LIB}/paddle/lib")
+
 if (WIN32)
+  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
+  set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+  set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+  set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+  set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
   if (WITH_STATIC_LIB)
     safe_set_static_flag()
     add_definitions(-DSTATIC_LIB)
-    set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "/w")
-    set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} "/w")
   endif()
   set(CMAKE_STATIC_LIBRARY_PREFIX "lib")
 else()
@@ -29,36 +60,15 @@ else()
 endif()
 message("flags" ${CMAKE_CXX_FLAGS})
 
-if(NOT DEFINED PADDLE_LIB)
-  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
-endif()
-if(NOT DEFINED DEMO_NAME)
-  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
-endif()
-
-
 if(WITH_GPU)
   if(NOT WIN32)
     set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
   else()
     if(CUDA_LIB STREQUAL "")
-    set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
+      set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
     endif()
   endif(NOT WIN32)
 endif()
-include_directories("${PADDLE_LIB}")
-include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
-include_directories("${PADDLE_LIB}/third_party/install/glog/include")
-include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
-include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
-if (NOT WIN32)
-include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
-include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
-include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
-endif(NOT WIN32)
-
-include_directories("${PADDLE_LIB}/third_party/boost")
-include_directories("${PADDLE_LIB}/third_party/eigen3")
 
 if (NOT WIN32)
   if (USE_TENSORRT AND WITH_GPU)
@@ -67,18 +77,6 @@ if (NOT WIN32)
   endif()
 endif(NOT WIN32)
 
-if (NOT WIN32)
-link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
-link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
-link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
-endif(NOT WIN32)
-
-link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
-link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
-link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
-link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
-link_directories("${PADDLE_LIB}/paddle/lib")
-
 if (NOT WIN32)
     set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph")
     if(EXISTS ${NGRAPH_PATH})
@@ -89,8 +87,6 @@ if (NOT WIN32)
     endif()
 endif()
 
-add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
-
 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
   set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
@@ -106,26 +102,25 @@ endif()
 
 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
-  set(DEPS
-      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
+  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
 else()
-  set(DEPS
-      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
+  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
 endif()
 
 if (NOT WIN32)
-set(EXTERNAL_LIB "-lrt -ldl -lpthread")
-set(DEPS ${DEPS}
-    ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
-    glog gflags protobuf snappystream snappy z xxhash
-    ${EXTERNAL_LIB})
+  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+  set(DEPS ${DEPS}
+      ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
+      glog gflags protobuf snappystream snappy z xxhash
+      ${EXTERNAL_LIB})
 else()
-set(DEPS ${DEPS}
-    ${MATH_LIB} ${MKLDNN_LIB}
-    ${CMAKE_STATIC_LIBRARY_PREFIX}glog  ${CMAKE_STATIC_LIBRARY_PREFIX}gflags  ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
-    ${EXTERNAL_LIB})
-# NOTE(dzhwinter) shlwapi is deprecated.
-set(DEPS ${DEPS} libcmt shlwapi)
+  set(DEPS ${DEPS}
+      ${MATH_LIB} ${MKLDNN_LIB}
+      ${CMAKE_STATIC_LIBRARY_PREFIX}glog  ${CMAKE_STATIC_LIBRARY_PREFIX}gflags  ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
+      ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash
+      snappystream ${EXTERNAL_LIB})
+  # NOTE(dzhwinter) shlwapi is deprecated.
+  set(DEPS ${DEPS} libcmt shlwapi)
 endif(NOT WIN32)
 
 if(WITH_GPU)
@@ -137,9 +132,10 @@ if(WITH_GPU)
     set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
   else()
     set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
-  set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
-  set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
+    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
   endif()
 endif()
 
+add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 target_link_libraries(${DEMO_NAME} ${DEPS})
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index e6e7de24783b160769e0c9f43d8f0700a035c314..40ca0d287ccde113a20abb1036af289a36f54e6c 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -122,6 +122,7 @@ class GpuPassStrategy : public PassStrategy {
         "conv_bn_fuse_pass",                    //
         "conv_elementwise_add_act_fuse_pass",   //
         "conv_elementwise_add2_act_fuse_pass",  //
+        "conv_elementwise_add_fuse_pass",       //
     });
   }
 
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index d61d635ed707bc455d495f2420925a3585234b5c..91670ba8ac5332fe6e83b7bff14cb1a349d7e2a2 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -103,6 +103,7 @@ class OpConverter {
   void ConvertBlock(const framework::proto::BlockDesc& block,
                     const std::unordered_set<std::string>& parameters,
                     const framework::Scope& scope, TensorRTEngine* engine) {
+    std::lock_guard<std::mutex> lk(mut_);  // one ConvertBlock at a time
     for (int i = 0; i < block.ops_size(); i++) {
       const auto& op = block.ops(i);
       ConvertOp(op, parameters, scope, engine);
@@ -125,6 +126,7 @@ class OpConverter {
   std::unordered_map<std::string, OpConverter*> converters_;
   // fluid inference scope
   framework::Scope* scope_{nullptr};
+  std::mutex mut_;
 };
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 8a4bc04b67879918c6ac8d1b40dae68a107034d4..46ce61b73611d05369f90e7d8f97e9b6724b860f 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -30,6 +30,13 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename
         ARGS --infer_model=${install_dir}/model)
 endfunction()
 
+function(inference_analysis_api_test_with_refer_result target install_dir filename)
+    inference_analysis_test(${target} SRCS ${filename}
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt
+             --refer_result=${install_dir}/result.txt)
+endfunction()
+
 # RNN1
 if(NOT APPLE AND WITH_MKLML)
     set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
@@ -83,14 +90,21 @@ set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
     inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
-inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
+inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
+
+# mobilenet with transpose op
+set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
+if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
+    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
+endif()
+inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)
 
 # resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
   "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
 
 # mobilenet with depthwise_conv op
-inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
+inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
   "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
 
 # anakin
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index adaa338e289936a7e6915bd23eba86863481dd06..a8f7d5c4461964bcb18bc8df24e282ea89264aa8 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -93,18 +93,20 @@ void profile(bool use_mkldnn = false) {
   SetInput(&input_slots_all);
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                  input_slots_all, &outputs, FLAGS_num_threads);
-
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    const float ocr_result_data[] = {
-        5.273636460856323538e-08, 3.296741795111302054e-07,
-        1.873261190610264748e-08, 3.403730275408634043e-08,
-        3.383312474625199684e-08};
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
-    for (size_t i = 0; i < std::min(5UL, size); i++) {
-      EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3);
+    std::string line;
+    std::ifstream file(FLAGS_refer_result);  // closed by RAII at scope exit
+    CHECK(file.is_open()) << "Cannot open " << FLAGS_refer_result;
+    std::getline(file, line);
+    auto refer = ProcessALine(line);
+
+    auto &output = outputs.front();
+    size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
+    CHECK_EQ(numel, refer.data.size());
+    for (size_t i = 0; i < numel; ++i) {
+      CHECK_LT(
+          fabs(static_cast<float *>(output.data.data())[i] - refer.data[i]),
+          1e-5);
     }
   }
 }
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 4c8bce4600a7ac47728a045a73290bba29e6f336..b07949c196ca1d41bb33a0b0499ebb3204d1be4a 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -36,6 +36,7 @@
 DEFINE_string(model_name, "", "model name");
 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data file");
+DEFINE_string(refer_result, "", "reference result for comparison");
 DEFINE_int32(batch_size, 1, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
 DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu
index 3b7c781795f02b9d9c9f2ead51034193ceb2a745..6a92762896b89a06a91cd11fb38587f7df69e6c3 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.cu
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cu
@@ -146,7 +146,8 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
 
     // At least use 32 threads, at most 512 threads.
     // blockx is multiple of 32.
-    int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L);
+    int blockx = std::min(
+        static_cast<long>(((feature_width * num_priors + 31) >> 5) << 5), 512L);
     int gridx = (feature_width * num_priors + blockx - 1) / blockx;
     dim3 threads(blockx, 1);
     dim3 grids(gridx, feature_height);
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
index 2d262f932aed9761143f7983c9a38f7a97c374ea..862d664d42e03d2ae968ea0bdec8ae8e50bf7fb3 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
@@ -35,12 +35,12 @@ namespace operators {
 
 template <typename T>
 __device__ bool GT_E(T a, T b) {
-  return (a > b) || fabs(a - b) < 1e-4;
+  return (a > b) || Eigen::numext::abs(a - b) < 1e-4;
 }
 
 template <typename T>
 __device__ bool LT_E(T a, T b) {
-  return (a < b) || fabs(a - b) < 1e-4;
+  return (a < b) || Eigen::numext::abs(a - b) < 1e-4;
 }
 
 template <typename T>
diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
index c3974138f4d4665c46bdfccaef09c0bd84b9d028..cda102e78d2de2876d54418574b7e07211fc92b4 100644
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -488,7 +488,7 @@ void AsyncGRPCServer::HandleRequest(
   while (true) {
     VLOG(4) << "HandleRequest " << rpc_name << " wait next";
     if (!cq->Next(&tag, &ok)) {
-      VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!";
+      LOG(WARNING) << "CompletionQueue " << rpc_name << " shutdown!";
       break;
     }
 
@@ -511,9 +511,8 @@ void AsyncGRPCServer::HandleRequest(
     // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
     // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
     if (!ok) {
-      LOG(WARNING) << "completion queue:" << rpc_name
-                   << " recv no regular event"
-                   << " context:" << base->Status2String(rpc_name);
+      VLOG(4) << "completion queue:" << rpc_name << " recv no regular event"
+              << " context:" << base->Status2String(rpc_name);
       TryToRegisterNewOne(rpc_name, req_id);
       delete base;
       continue;
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
index b73a32af89e882ac02623dd1d312f400a78fc47a..d212e6f8437e69e71c010b6af27a33ff5e39e1e1 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -150,19 +150,27 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
                                                        label.data<int64_t>()));
     }
 
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto pre_out_mat = EigenMatrix<T>::From(pre_out);
-    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
-    auto out_grad_mat = EigenMatrix<T>::From(out_grad);
+    // softrelu derivative
 
-    Eigen::array<int, 2> bcast{1, static_cast<int>(pre_out_grad.dims()[1])};
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
 
-    // softrelu derivative
-    pre_out_grad_mat.device(place) =
-        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
+    auto* pre_out_grad_data = pre_out_grad.data<T>();
+    auto* pre_out_data = pre_out.data<T>();
+    auto n = pre_out.numel();
+    blas.VEXP(n, pre_out_data, pre_out_grad_data);
+    blas.VINV(n, pre_out_grad_data, pre_out_grad_data);
+    for (int64_t i = 0; i < n; ++i) {
+      pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i];
+    }
     bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
-    pre_out_grad_mat.device(place) =
-        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
+    auto* out_grad_data = out_grad.data<T>();
+
+    int64_t dim0 = pre_out_grad.dims()[0];
+    int64_t dim1 = pre_out_grad.dims()[1];
+    for (int64_t i = 0; i < dim0; ++i) {
+      T tmp = out_grad_data[i];
+      blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1);
+    }
     // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
     // be consistent with the clipping in forward.
 
diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc
index 4ecd8634ff41ff4eba6b5ed1d0fc78068190dce5..253b65a5f33308fc2c94537641b0fa19378b0cc9 100644
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
@@ -124,8 +124,9 @@ REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp);
 REGISTER_OP_CPU_KERNEL(
-    huber_loss,
-    ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>);
+    huber_loss, ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::HuberLossKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     huber_loss_grad,
-    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 9f3a81f22cc52bef719f472e43f91bc81dfe2af6..f67f57827bc03e134bf87edd5bf033adb5098916 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -181,6 +181,9 @@ class Blas {
               const framework::Tensor& mat_b, const MatDescriptor& dim_b,
               T alpha, framework::Tensor* mat_out, T beta) const;
 
+  template <typename T>
+  void VINV(int n, const T* a, T* y) const;
+
  private:
   const DeviceContext& context_;
 };
@@ -282,6 +285,11 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template BatchedGEMM<T>(args...);
   }
 
+  template <typename... ARGS>
+  void VINV(ARGS... args) const {
+    Base()->template VINV<T>(args...);
+  }
+
  private:
   const Blas<DeviceContext>* Base() const {
     return static_cast<const Blas<DeviceContext>*>(this);
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index c84087bb1e4849b27d53e05f046c93f631150f6f..972366bc093f4b7f0a090cf31213f75ccd89fd82 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -118,6 +118,11 @@ struct CBlas<float> {
   static void VPOW(ARGS... args) {
     platform::dynload::vsPowx(args...);
   }
+
+  template <typename... ARGS>
+  static void VINV(ARGS... args) {
+    platform::dynload::vsInv(args...);
+  }
 };
 
 template <>
@@ -213,6 +218,11 @@ struct CBlas<double> {
   static void VPOW(ARGS... args) {
     platform::dynload::vdPowx(args...);
   }
+
+  template <typename... ARGS>
+  static void VINV(ARGS... args) {
+    platform::dynload::vdInv(args...);
+  }
 };
 
 #else
@@ -603,6 +613,17 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
         dim_a.stride_, dim_b.stride_);
   }
 }
+template <typename DeviceContext>
+template <typename T>
+void Blas<DeviceContext>::VINV(int n, const T *a, T *y) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VINV(n, a, y);
+#else
+  for (int i = 0; i < n; ++i) {
+    y[i] = 1.0 / a[i];
+  }
+#endif
+}
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index e1e4d168db3ca594b44396a6e30c5bfc03483eaf..57726956cfba802183903b436c82b15c34d8fcc9 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -18,9 +18,6 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"
diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
index 2b3d38d95a18fad9b76e616cdf2cb6c3eb07da3a..24df1f93edd85145d703ed3277b0d1ca06e67009 100644
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -15,14 +15,10 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <string>
-
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc
index 5641f914523771f47bd7f814bfd39964a53deefc..022ffc533779363b08404b8715ac37194a4be392 100644
--- a/paddle/fluid/operators/math/detail/avx_functions.cc
+++ b/paddle/fluid/operators/math/detail/avx_functions.cc
@@ -14,10 +14,8 @@ limitations under the License. */
 
 #ifdef __AVX__
 
-#include <immintrin.h>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
-// TODO(qingqing) refine this dependence
-#include "paddle/legacy/cuda/src/avx_mathfun.h"
+#include "paddle/fluid/operators/math/detail/avx_mathfun.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/detail/avx_mathfun.h b/paddle/fluid/operators/math/detail/avx_mathfun.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7cf91134e4553dfcd935a31993e06dfa74650ac
--- /dev/null
+++ b/paddle/fluid/operators/math/detail/avx_mathfun.h
@@ -0,0 +1,731 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+   AVX implementation of sin, cos, sincos, exp and log
+
+   Based on "sse_mathfun.h", by Julien Pommier
+   http://gruntthepeon.free.fr/ssemath/
+
+   Copyright (C) 2012 Giovanni Garberoglio
+   Interdisciplinary Laboratory for Computational Science (LISC)
+   Fondazione Bruno Kessler and University of Trento
+   via Sommarive, 18
+   I-38123 Trento (Italy)
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#pragma once
+#include "paddle/fluid/platform/cpu_info.h"
+/* short aliases for the SIMD register types used throughout this header */
+typedef __m256 v8sf;   // vector of 8 float (avx)
+typedef __m256i v8si;  // vector of 8 int   (avx)
+typedef __m128i v4si;  // vector of 4 int   (sse2)
+
+#define _PI32AVX_CONST(Name, Val)                                          \
+  static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = {Val, Val, \
+                                                                 Val, Val}
+/* 4-lane int constants read as v4si by the non-AVX2 (SSE2) code paths */
+_PI32AVX_CONST(1, 1);
+_PI32AVX_CONST(inv1, ~1);
+_PI32AVX_CONST(2, 2);
+_PI32AVX_CONST(4, 4);
+
+/* helpers that declare ALIGN32-qualified 8-lane arrays, one AVX vector each */
+#define _PS256_CONST(Name, Val)                                   \
+  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \
+      Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PI32_CONST256(Name, Val)                                  \
+  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \
+      Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PS256_CONST_TYPE(Name, Type, Val)                       \
+  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \
+      Val, Val, Val, Val, Val, Val, Val, Val}
+
+_PS256_CONST(1, 1.0f);
+_PS256_CONST(0p5, 0.5f);
+/* bit pattern of the smallest normalized (non denormalized) positive float */
+_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);  // selects the exponent bits
+_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);  // sign + mantissa bits
+
+_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);  // sign bit only
+_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);  // all but the sign bit
+
+_PI32_CONST256(0, 0);
+_PI32_CONST256(1, 1);
+_PI32_CONST256(inv1, ~1);
+_PI32_CONST256(2, 2);
+_PI32_CONST256(4, 4);
+_PI32_CONST256(0x7f, 0x7f);  // IEEE-754 single precision exponent bias
+
+_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);  // sqrt(1/2)
+_PS256_CONST(cephes_log_p0, 7.0376836292E-2);  // p0..p8: log256_ps polynomial
+_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
+_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
+_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
+_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
+_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
+_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
+_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
+_PS256_CONST(cephes_log_q1, -2.12194440e-4);  // q1 + q2 ~= ln(2)
+_PS256_CONST(cephes_log_q2, 0.693359375);
+
+#ifndef __AVX2__
+// No AVX2: emulate the 256-bit integer ops on two 128-bit halves via SSE2.
+typedef union imm_xmm_union {
+  v8si imm;
+  v4si xmm[2];
+} imm_xmm_union;
+// Split the v8si imm_ into its two v4si halves xmm0_ and xmm1_.
+#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_)  \
+  {                                          \
+    imm_xmm_union ALIGN32_BEG u ALIGN32_END; \
+    u.imm = imm_;                            \
+    xmm0_ = u.xmm[0];                        \
+    xmm1_ = u.xmm[1];                        \
+  }
+// Reassemble the two v4si halves xmm0_ and xmm1_ into the v8si imm_.
+#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_)  \
+  {                                          \
+    imm_xmm_union ALIGN32_BEG u ALIGN32_END; \
+    u.xmm[0] = xmm0_;                        \
+    u.xmm[1] = xmm1_;                        \
+    imm_ = u.imm;                            \
+  }
+
+#define AVX2_BITOP_USING_SSE2(fn)                        \
+  static inline v8si avx2_mm256_##fn(v8si x, int a) {    \
+    /* use SSE2 instruction to perform the bitop AVX2 */ \
+    v4si x1, x2;                                         \
+    v8si ret;                                            \
+    COPY_IMM_TO_XMM(x, x1, x2);                          \
+    x1 = _mm_##fn(x1, a);                                \
+    x2 = _mm_##fn(x2, a);                                \
+    COPY_XMM_TO_IMM(x1, x2, ret);                        \
+    return (ret);                                        \
+  }
+
+// avx2_mm256_{slli,srli}_epi32: 32-bit lane shifts applied to each half
+AVX2_BITOP_USING_SSE2(slli_epi32)
+AVX2_BITOP_USING_SSE2(srli_epi32)
+
+#define AVX2_INTOP_USING_SSE2(fn)                                     \
+  static inline v8si avx2_mm256_##fn(v8si x, v8si y) {                \
+    /* use SSE2 instructions to perform the AVX2 integer operation */ \
+    v4si x1, x2;                                                      \
+    v4si y1, y2;                                                      \
+    v8si ret;                                                         \
+    COPY_IMM_TO_XMM(x, x1, x2);                                       \
+    COPY_IMM_TO_XMM(y, y1, y2);                                       \
+    x1 = _mm_##fn(x1, y1);                                            \
+    x2 = _mm_##fn(x2, y2);                                            \
+    COPY_XMM_TO_IMM(x1, x2, ret);                                     \
+    return (ret);                                                     \
+  }
+
+// two-operand integer ops; the *_si128 names are aliased to *_si256 below
+AVX2_INTOP_USING_SSE2(and_si128)
+AVX2_INTOP_USING_SSE2(andnot_si128)
+AVX2_INTOP_USING_SSE2(cmpeq_epi32)
+AVX2_INTOP_USING_SSE2(sub_epi32)
+AVX2_INTOP_USING_SSE2(add_epi32)
+#define avx2_mm256_and_si256 avx2_mm256_and_si128
+#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128
+#else /* AVX2 available: the helper names map onto the native intrinsics */
+#define avx2_mm256_slli_epi32 _mm256_slli_epi32
+#define avx2_mm256_srli_epi32 _mm256_srli_epi32
+#define avx2_mm256_and_si256 _mm256_and_si256
+#define avx2_mm256_andnot_si256 _mm256_andnot_si256
+#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32
+#define avx2_mm256_sub_epi32 _mm256_sub_epi32
+#define avx2_mm256_add_epi32 _mm256_add_epi32
+#endif /* __AVX2__ */
+
+/* Natural logarithm of 8 packed floats; lanes with x <= 0 return NaN.
+   Declared inline because this definition lives in a header.
+*/
+inline v8sf log256_ps(v8sf x) {
+  v8si imm0;
+  v8sf one = *(v8sf *)_ps256_1;
+
+  // all-ones in every lane where x <= 0; OR-ed into the result at the end
+  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
+
+  x = _mm256_max_ps(
+      x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
+
+  // shift the float bits right by 23 to expose the biased exponent
+  imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
+
+  /* keep only the fractional part */
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
+  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
+
+  // subtract the exponent bias 0x7f, then convert the exponent to float
+  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  v8sf e = _mm256_cvtepi32_ps(imm0);
+
+  e = _mm256_add_ps(e, one);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  // mask is all-ones where x < SQRTHF; it drives the branch-free part2
+  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  v8sf tmp = _mm256_and_ps(x, mask);
+  x = _mm256_sub_ps(x, one);
+  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
+  x = _mm256_add_ps(x, tmp);
+
+  v8sf z = _mm256_mul_ps(x, x);
+
+  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
+  y = _mm256_mul_ps(y, x);
+
+  y = _mm256_mul_ps(y, z);
+
+  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+  y = _mm256_add_ps(y, tmp);
+
+  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+
+  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
+  x = _mm256_add_ps(x, y);
+  x = _mm256_add_ps(x, tmp);
+  x = _mm256_or_ps(x, invalid_mask);  // lanes with x <= 0 become NaN
+  return x;
+}
+
+_PS256_CONST(exp_hi, 88.3762626647949f);   // upper clamp on exp256_ps input
+_PS256_CONST(exp_lo, -88.3762626647949f);  // lower clamp on exp256_ps input
+
+_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);  // log2(e)
+_PS256_CONST(cephes_exp_C1, 0.693359375);          // C1 + C2 ~= ln(2)
+_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
+
+_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);  // p0..p5: exp256_ps polynomial
+_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+inline v8sf exp256_ps(v8sf x) {  // exp of 8 packed floats; inline: header
+  v8sf tmp = _mm256_setzero_ps(), fx;
+  v8si imm0;
+  v8sf one = *(v8sf *)_ps256_1;
+
+  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
+  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
+  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
+
+  /* fx = floor(fx). The SSE original emulated floorf with a truncating
+     int conversion plus the correction below; with the native floor the
+     correction never fires (floor(fx) <= fx) and is kept as in cephes. */
+
+  tmp = _mm256_floor_ps(fx);
+
+  /* if greater, subtract 1 */
+  // mask = (tmp > fx) ? 1.0f : 0.0f in each lane
+  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
+  mask = _mm256_and_ps(mask, one);
+  fx = _mm256_sub_ps(tmp, mask);
+
+  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
+  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
+  x = _mm256_sub_ps(x, tmp);
+  x = _mm256_sub_ps(x, z);
+
+  z = _mm256_mul_ps(x, x);
+
+  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, x);
+  y = _mm256_add_ps(y, one);
+
+  /* build 2^n */
+  imm0 = _mm256_cvttps_epi32(fx);
+  // add the exponent bias 0x7f and shift it into the float exponent field
+  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_slli_epi32(imm0, 23);
+  v8sf pow2n = _mm256_castsi256_ps(imm0);
+  y = _mm256_mul_ps(y, pow2n);
+  return y;
+}
+
+_PS256_CONST(minus_cephes_DP1, -0.78515625);  // DP1 + DP2 + DP3 ~= Pi/4
+_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS256_CONST(sincof_p0, -1.9515295891E-4);  // sincof_*: sine polynomial
+_PS256_CONST(sincof_p1, 8.3321608736E-3);
+_PS256_CONST(sincof_p2, -1.6666654611E-1);
+_PS256_CONST(coscof_p0, 2.443315711809948E-005);  // coscof_*: cosine polynomial
+_PS256_CONST(coscof_p1, -1.388731625493765E-003);
+_PS256_CONST(coscof_p2, 4.166664568298827E-002);
+_PS256_CONST(cephes_FOPI, 1.27323954473516);  // 4 / M_PI
+
+/* evaluation of 8 sines at once using AVX intrinsics
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+   Declared inline because this definition lives in a header.
+*/
+inline v8sf sin256_ps(v8sf x) {  // any x
+  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
+  v8si imm0, imm2;
+
+#ifndef __AVX2__
+  v4si imm0_1, imm0_2;
+  v4si imm2_1, imm2_2;
+#endif
+
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+
+  /* scale by 4/Pi */
+  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+
+/*
+  Here we start a series of integer operations, which are in the
+  realm of AVX2.
+  If we don't have AVX2, let's perform them using SSE2 intrinsics
+*/
+
+#ifdef __AVX2__
+  /* store the integer part of y in imm2 */
+  imm2 = _mm256_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  // another two AVX2 instructions
+  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  /* get the swap sign flag */
+  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_slli_epi32(imm0, 29);
+  /* get the polynom selection mask
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+#else
+  /* we use SSE2 routines to perform the integer ops */
+  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
+
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
+  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+
+  imm0_1 = _mm_slli_epi32(imm0_1, 29);
+  imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+
+  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+
+  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
+  v8sf poly_mask = _mm256_castsi256_ps(imm2);
+  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
+  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
+  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = _mm256_mul_ps(y, xmm1);
+  xmm2 = _mm256_mul_ps(y, xmm2);
+  xmm3 = _mm256_mul_ps(y, xmm3);
+  x = _mm256_add_ps(x, xmm1);
+  x = _mm256_add_ps(x, xmm2);
+  x = _mm256_add_ps(x, xmm3);
+
+  /* Evaluate the first polynom: cosine coefficients (coscof_*) */
+  y = *(v8sf *)_ps256_coscof_p0;
+  v8sf z = _mm256_mul_ps(x, x);
+
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_mul_ps(y, z);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+
+  /* Evaluate the second polynom: sine coefficients (sincof_*) */
+
+  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_mul_ps(y2, x);
+  y2 = _mm256_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */
+  xmm3 = poly_mask;
+  y2 = _mm256_and_ps(xmm3, y2);  // sine polynom where the mask is set
+  y = _mm256_andnot_ps(xmm3, y);
+  y = _mm256_add_ps(y, y2);
+  /* update the sign */
+  y = _mm256_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* cosine of 8 packed floats; almost the same as sin256_ps (inline: header) */
+inline v8sf cos256_ps(v8sf x) {  // any x
+  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
+  v8si imm0, imm2;
+
+#ifndef __AVX2__
+  v4si imm0_1, imm0_2;
+  v4si imm2_1, imm2_2;
+#endif
+
+  /* take the absolute value */
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+
+  /* scale by 4/Pi */
+  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+
+#ifdef __AVX2__
+  /* store the integer part of y in imm2 */
+  imm2 = _mm256_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  y = _mm256_cvtepi32_ps(imm2);
+  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
+
+  /* get the swap sign flag */
+  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_slli_epi32(imm0, 29);
+  /* get the polynom selection mask */
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+#else
+
+  /* we use SSE2 routines to perform the integer ops */
+  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
+
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
+  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
+
+  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
+  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
+
+  imm0_1 = _mm_slli_epi32(imm0_1, 29);
+  imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+
+  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+
+  v8sf sign_bit = _mm256_castsi256_ps(imm0);
+  v8sf poly_mask = _mm256_castsi256_ps(imm2);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
+  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
+  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = _mm256_mul_ps(y, xmm1);
+  xmm2 = _mm256_mul_ps(y, xmm2);
+  xmm3 = _mm256_mul_ps(y, xmm3);
+  x = _mm256_add_ps(x, xmm1);
+  x = _mm256_add_ps(x, xmm2);
+  x = _mm256_add_ps(x, xmm3);
+
+  /* Evaluate the first polynom: cosine coefficients (coscof_*) */
+  y = *(v8sf *)_ps256_coscof_p0;
+  v8sf z = _mm256_mul_ps(x, x);
+
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_mul_ps(y, z);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+
+  /* Evaluate the second polynom: sine coefficients (sincof_*) */
+
+  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_mul_ps(y2, x);
+  y2 = _mm256_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */
+  xmm3 = poly_mask;
+  y2 = _mm256_and_ps(xmm3, y2);  // sine polynom where the mask is set
+  y = _mm256_andnot_ps(xmm3, y);
+  y = _mm256_add_ps(y, y2);
+  /* update the sign */
+  y = _mm256_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* Computes the sine and cosine of 8 packed floats, stored to *s and *c.
+   sin256_ps and cos256_ps are almost identical, so this is almost as fast
+   as either one alone. Declared inline because it is defined in a header. */
+inline void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
+  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
+  v8si imm0, imm2, imm4;
+
+#ifndef __AVX2__
+  v4si imm0_1, imm0_2;
+  v4si imm2_1, imm2_2;
+  v4si imm4_1, imm4_2;
+#endif
+
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
+
+  /* scale by 4/Pi */
+  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+
+#ifdef __AVX2__
+  /* store the integer part of y in imm2 */
+  imm2 = _mm256_cvttps_epi32(y);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+
+  y = _mm256_cvtepi32_ps(imm2);
+  imm4 = imm2;
+
+  /* get the swap sign flag for the sine */
+  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_slli_epi32(imm0, 29);
+  // imm0 is reinterpreted as swap_sign_bit_sin after the #endif below
+
+  /* get the polynom selection mask for the sine*/
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+// imm2 is reinterpreted as poly_mask after the #endif below
+#else
+  /* we use SSE2 routines to perform the integer ops */
+  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
+
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  imm4_1 = imm2_1;
+  imm4_2 = imm2_2;
+
+  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
+  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+
+  imm0_1 = _mm_slli_epi32(imm0_1, 29);
+  imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+
+  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
+  v8sf poly_mask = _mm256_castsi256_ps(imm2);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
+  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
+  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = _mm256_mul_ps(y, xmm1);
+  xmm2 = _mm256_mul_ps(y, xmm2);
+  xmm3 = _mm256_mul_ps(y, xmm3);
+  x = _mm256_add_ps(x, xmm1);
+  x = _mm256_add_ps(x, xmm2);
+  x = _mm256_add_ps(x, xmm3);
+
+#ifdef __AVX2__
+  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
+  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
+  imm4 = avx2_mm256_slli_epi32(imm4, 29);
+#else
+  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
+  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
+
+  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
+  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
+
+  imm4_1 = _mm_slli_epi32(imm4_1, 29);
+  imm4_2 = _mm_slli_epi32(imm4_2, 29);
+
+  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
+#endif
+
+  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);
+
+  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+  /* Evaluate the first polynom: cosine coefficients (coscof_*) */
+  v8sf z = _mm256_mul_ps(x, x);
+  y = *(v8sf *)_ps256_coscof_p0;
+
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_mul_ps(y, z);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+
+  /* Evaluate the second polynom: sine coefficients (sincof_*) */
+
+  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_mul_ps(y2, x);
+  y2 = _mm256_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */
+  xmm3 = poly_mask;
+  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
+  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
+  y2 = _mm256_sub_ps(y2, ysin2);
+  y = _mm256_sub_ps(y, ysin1);
+
+  xmm1 = _mm256_add_ps(ysin1, ysin2);  // lanes selected for the sine
+  xmm2 = _mm256_add_ps(y, y2);         // remaining lanes form the cosine
+
+  /* update the sign */
+  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
+}
diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
index 78d0c3e8808f0daf6a18d2217664e965773b95ff..2b08c1059713fb9acd0cfdcf39ac2ad283172724 100644
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -113,26 +113,27 @@ void VXXJitCode::generate() {
   ret();
 }
 
-const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f),
-                                          REPEAT_8TIMES(2.f),
-                                          REPEAT_8TIMES(0.5f),
-                                          REPEAT_8TIMES(EXP_HIG),
-                                          REPEAT_8TIMES(EXP_LOW),
-                                          REPEAT_8TIMES(CEPHES_LOG2EF),
-                                          REPEAT_8TIMES(CEPHES_EXP_C1),
-                                          REPEAT_8TIMES(CEPHES_EXP_C2),
-                                          REPEAT_8TIMES(CEPHES_EXP_P0),
-                                          REPEAT_8TIMES(CEPHES_EXP_P1),
-                                          REPEAT_8TIMES(CEPHES_EXP_P2),
-                                          REPEAT_8TIMES(CEPHES_EXP_P3),
-                                          REPEAT_8TIMES(CEPHES_EXP_P4),
-                                          REPEAT_8TIMES(CEPHES_EXP_P5),
-                                          REPEAT_8TIMES(EXP_MAX_INPUT),
-                                          REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
-                                          REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
+const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = {
+    REPEAT_8TIMES(1.f),
+    REPEAT_8TIMES(2.f),
+    REPEAT_8TIMES(0.5f),
+    REPEAT_8TIMES(EXP_HIG),
+    REPEAT_8TIMES(EXP_LOW),
+    REPEAT_8TIMES(CEPHES_LOG2EF),
+    REPEAT_8TIMES(CEPHES_EXP_C1),
+    REPEAT_8TIMES(CEPHES_EXP_C2),
+    REPEAT_8TIMES(CEPHES_EXP_P0),
+    REPEAT_8TIMES(CEPHES_EXP_P1),
+    REPEAT_8TIMES(CEPHES_EXP_P2),
+    REPEAT_8TIMES(CEPHES_EXP_P3),
+    REPEAT_8TIMES(CEPHES_EXP_P4),
+    REPEAT_8TIMES(CEPHES_EXP_P5),
+    REPEAT_8TIMES(EXP_MAX_INPUT),
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
 
-const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
-int g_tmp_mem[16] ALIGN32 = {0};
+const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)};
+int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0};
 
 bool VActJitCode::init(int d, operand_type type) {
   // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h
index e2b4761435594fdc952ff5dba5b5fa4f4aa98e6c..6d22bf675724166d0701e9a51d0d23ae00ef1048 100644
--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -47,7 +47,6 @@ extern const float exp_float_consts[];
 extern const int exp_int_0x7f[];
 extern int g_tmp_mem[];
 
-#define ALIGN32 __attribute__((aligned(32)))
 #define EXP_HIG 88.3762626647949f
 #define EXP_LOW -88.3762626647949f
 #define CEPHES_LOG2EF 1.44269504088896341
diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
index eeb305a88bee8f0e21b205684d24b19ca4631f65..ac2d29f1c18392ebf917cc097e63670e06b1eded 100644
--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
@@ -16,9 +16,6 @@ limitations under the License. */
 #include <limits>
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 namespace paddle {
 namespace operators {
@@ -133,8 +130,8 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
           /* AVX instructions.*/                                               \
           __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);               \
           __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);               \
-          __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);        \
-          __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);        \
+          __m128i lo_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 0);     \
+          __m128i hi_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 1);     \
           lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);                      \
           hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);                      \
           lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));                 \
diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
index cb49e66488bd69d92430cbf6de1d08348ffe0202..e21092037a27d26cd31205b1b5d8e2f0cb8380cd 100644
--- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
+++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
@@ -13,9 +13,6 @@ limitations under the License. */
 #include <limits>
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 namespace paddle {
 namespace operators {
@@ -121,7 +118,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
       if (rest_ != 0) {                                                        \
         j = offset + this->num_ - block;                                       \
         tmp = _mm256_loadu_ps((const float*)x + j);                            \
-        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec);    \
+        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
         sum = _mm256_add_ps(sum, tmp);                                         \
       }                                                                        \
       hi = _mm256_extractf128_ps(sum, 1);                                      \
@@ -145,7 +142,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
         j = offset + this->num_ - block;                                       \
         tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);   \
         tmp = _mm256_mul_ps(tmp, tmp);                                         \
-        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec);    \
+        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
         sum = _mm256_add_ps(sum, tmp);                                         \
       }                                                                        \
       hi = _mm256_extractf128_ps(sum, 1);                                      \
diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc
index 5a6e64b6f87d33249f0153e5f391deaf78e53de5..d55e832cc2d9a4a5e2cb7fe5cf451a1205601951 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -14,218 +14,380 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/matrix_bit_code.h"
 #include <iostream>
+#include <map>
+
 namespace paddle {
 namespace operators {
 namespace math {
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::Add(const framework::Tensor& vec,
-                                  framework::Tensor* tmat) {
-  size_t batch_size = tmat->dims()[0];
-  size_t width = tmat->dims()[1];
-  for (size_t i = 0; i < batch_size; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-      tmat->data<T>()[i * width + j] += vec.data<T>()[index];
+struct MatrixBitCodeFunctorAdd : public boost::static_visitor<void> {
+  const framework::Tensor &vec_;
+  framework::Tensor *tmat_;
+
+  MatrixBitCodeFunctorAdd(const framework::Tensor &vec, framework::Tensor *tmat)
+      : vec_(vec), tmat_(tmat) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    size_t batch_size = tmat_->dims()[0];
+    size_t width = tmat_->dims()[1];
+    auto *tmat_data = tmat_->data<T>();
+    auto *vec_data = vec_.data<T>();
+    for (size_t i = 0; i < batch_size; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        tmat_data[i * width + j] += vec_data[index];
+      }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Add(const framework::Tensor &vec,
+                                  framework::Tensor *tmat) {
+  MatrixBitCodeFunctorAdd<T> func(vec, tmat);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
-                                      framework::Tensor* vec) {
-  size_t batch_size = tmat.dims()[0];
-  size_t width = tmat.dims()[1];
-  for (size_t i = 0; i < batch_size; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-      vec->data<T>()[index] += tmat.data<T>()[i * width + j];
+struct MatrixBitCodeFunctorAddGrad : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  framework::Tensor *vec_;
+  MatrixBitCodeFunctorAddGrad(const framework::Tensor &tmat,
+                              framework::Tensor *vec)
+      : tmat_(tmat), vec_(vec) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &table) {
+    size_t batch_size = tmat_.dims()[0];
+    size_t width = tmat_.dims()[1];
+    auto *vec_data = vec_->data<T>();
+    auto *tmat_data = tmat_.data<T>();
+    for (size_t i = 0; i < batch_size; ++i) {
+      auto code = table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        vec_data[index] += tmat_data[i * width + j];
+      }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
+                                      framework::Tensor *vec) {
+  MatrixBitCodeFunctorAddGrad<T> func(tmat, vec);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
-                                      framework::SelectedRows* vec) {
-  size_t batch_size = tmat.dims()[0];
-  size_t width = tmat.dims()[1];
-  for (size_t i = 0; i < batch_size; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-      int64_t row_index = vec->GetIndexFromId(static_cast<int64_t>(index));
-      vec->mutable_value()->data<T>()[row_index] +=
-          tmat.data<T>()[i * width + j];
+struct MatrixBitCodeFunctorSelectedRowsAddGrad
+    : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  framework::SelectedRows *vec_;
+
+  MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat,
+                                          framework::SelectedRows *vec)
+      : tmat_(tmat), vec_(vec) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    size_t batch_size = tmat_.dims()[0];
+    size_t width = tmat_.dims()[1];
+    auto *vec_data = vec_->mutable_value()->template data<T>();
+    auto *tmat_data = tmat_.data<T>();
+    for (size_t i = 0; i < batch_size; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        int64_t row_index = vec_->GetIndexFromId(static_cast<int64_t>(index));
+        vec_data[row_index] += tmat_data[i * width + j];
+      }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
+                                      framework::SelectedRows *vec) {
+  MatrixBitCodeFunctorSelectedRowsAddGrad<T> func(tmat, vec);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor& tmat,
-                                  framework::Tensor* sum, T scale_sum) {
-  size_t num_samples = tmat.dims()[0];
-  size_t o_width = tmat.dims()[1];
-  for (size_t i = 0; i < num_samples; ++i) {
-    T sm = static_cast<T>(0.0);
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      if (code->calc_bit(j)) {
-        // calc_bit starts from right most bit, while data in tmat[i] is in the
-        // reverse order.
-        sm += tmat.data<T>()[i * o_width + j];
+struct MatrixBitCodeFunctorSum : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  framework::Tensor *sum_;
+  T scale_sum_;
+
+  MatrixBitCodeFunctorSum(const framework::Tensor &tmat, framework::Tensor *sum,
+                          T scale_sum)
+      : tmat_(tmat), sum_(sum), scale_sum_(scale_sum) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    size_t num_samples = tmat_.dims()[0];
+    size_t o_width = tmat_.dims()[1];
+    auto *tmat_data = tmat_.data<T>();
+    auto *sum_data = sum_->data<T>();
+    for (size_t i = 0; i < num_samples; ++i) {
+      T sm = static_cast<T>(0.0);
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        if (code.calc_bit(j)) {
+          // calc_bit starts from the rightmost bit,
+          // while the data in tmat[i] is stored in
+          // the reverse order.
+          sm += tmat_data[i * o_width + j];
+        }
       }
+      sum_data[i] = scale_sum_ * sm;
     }
-    sum->data<T>()[i] = scale_sum * sm;
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor &tmat,
+                                  framework::Tensor *sum, T scale_sum) {
+  MatrixBitCodeFunctorSum<T> func(tmat, sum, scale_sum);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
-                                  const framework::Tensor& weight,
-                                  const framework::Tensor& input) {
-  auto blas =
-      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
-  size_t num_samples = tmat->dims()[0];
-  size_t tmat_width = tmat->dims()[1];
-  size_t input_width = input.dims()[1];
-  size_t weight_width = weight.dims()[1];
-  auto tmat_value = tmat->data<T>();
-  auto weight_value = weight.data<T>();
-  auto input_value = input.data<T>();
-  for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    const T* input_row = input_value + input_width * i;
-    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-      const T* weight_row = weight_value + weight_width * index;
-      T sum = static_cast<T>(0.0);
-      sum = blas.DOT(input_width, weight_row, input_row);
-      tmat_value[i * tmat_width + j] += sum;
+struct MatrixBitCodeFunctorMul : public boost::static_visitor<void> {
+  framework::Tensor *tmat_;
+  const framework::Tensor &weight_;
+  const framework::Tensor &input_;
+
+  MatrixBitCodeFunctorMul(framework::Tensor *tmat,
+                          const framework::Tensor &weight,
+                          const framework::Tensor &input)
+      : tmat_(tmat), weight_(weight), input_(input) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    auto blas =
+        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+    size_t num_samples = tmat_->dims()[0];
+    size_t tmat_width = tmat_->dims()[1];
+    size_t input_width = input_.dims()[1];
+    size_t weight_width = weight_.dims()[1];
+    auto tmat_value = tmat_->data<T>();
+    auto weight_value = weight_.data<T>();
+    auto input_value = input_.data<T>();
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      const T *input_row = input_value + input_width * i;
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        const T *weight_row = weight_value + weight_width * index;
+        T sum = blas.DOT(input_width, weight_row, input_row);
+        tmat_value[i * tmat_width + j] += sum;
+      }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Mul(framework::Tensor *tmat,
+                                  const framework::Tensor &weight,
+                                  const framework::Tensor &input) {
+  MatrixBitCodeFunctorMul<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
 
+template <typename T, size_t N>
+class ReservedVector : public std::vector<T> {
+ public:
+  ReservedVector() { this->reserve(N); }
+};
+
 template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
-                                            framework::Tensor* weight,
-                                            const framework::Tensor& input) {
-  auto blas =
-      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
-  size_t num_samples = tmat.dims()[0];
-  size_t input_width = input.dims()[1];
-  size_t tmat_width = tmat.dims()[1];
-  size_t weight_width = weight->dims()[1];
-  auto tmat_value = tmat.data<T>();
-  auto weight_value = weight->data<T>();
-  auto input_value = input.data<T>();
-
-  std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
-
-  for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    const T* input_value_row = input_value + input_width * i;
-    const T* tmat_row = tmat_value + i * tmat_width;
-    for (int j = 0; j < code_length; ++j) {
-      ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  framework::Tensor *weight_;
+  const framework::Tensor &input_;
+  MatrixBitCodeFunctorMulGradWeight(const framework::Tensor &tmat,
+                                    framework::Tensor *weight,
+                                    const framework::Tensor &input)
+      : tmat_(tmat), weight_(weight), input_(input) {}
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    auto blas =
+        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+    size_t num_samples = tmat_.dims()[0];
+    size_t input_width = input_.dims()[1];
+    size_t tmat_width = tmat_.dims()[1];
+    size_t weight_width = weight_->dims()[1];
+    auto tmat_value = tmat_.data<T>();
+    auto weight_value = weight_->data<T>();
+    auto input_value = input_.data<T>();
+
+    std::map<int, ReservedVector<std::pair<T, const T *>, 8u>> ops;
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      const T *input_value_row = input_value + input_width * i;
+      const T *tmat_row = tmat_value + i * tmat_width;
+      for (int j = 0; j < code_length; ++j) {
+        ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+      }
     }
-  }
-  for (auto& op : ops) {
-    auto& op_in_row = op.second;
-    for (auto& pair : op_in_row) {
-      auto& scale = pair.first;
-      auto* input_row = pair.second;
-      T* weight_row = weight_value + op.first * weight_width;
-      blas.AXPY(input_width, scale, input_row, weight_row);
+    for (auto &op : ops) {
+      auto &op_in_row = op.second;
+      for (auto &pair : op_in_row) {
+        auto &scale = pair.first;
+        auto *input_row = pair.second;
+        T *weight_row = weight_value + op.first * weight_width;
+        blas.AXPY(input_width, scale, input_row, weight_row);
+      }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor &tmat,
+                                            framework::Tensor *weight,
+                                            const framework::Tensor &input) {
+  MatrixBitCodeFunctorMulGradWeight<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
-                                            framework::SelectedRows* weight,
-                                            const framework::Tensor& input) {
-  auto blas =
-      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
-  size_t num_samples = tmat.dims()[0];
-  size_t input_width = input.dims()[1];
-  size_t tmat_width = tmat.dims()[1];
-  size_t weight_width = weight->value().dims()[1];
-  auto tmat_value = tmat.data<T>();
-  auto weight_value = weight->mutable_value()->data<T>();
-  auto input_value = input.data<T>();
-
-  std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
-  ops.reserve(weight->rows().size());
-
-  for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    const T* input_value_row = input_value + input_width * i;
-    const T* tmat_row = tmat_value + i * tmat_width;
-    for (int j = 0; j < code_length; ++j) {
-      ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+struct MatrixBitCodeFunctorMulGradWeightSR
+    : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  framework::SelectedRows *weight_;
+  const framework::Tensor &input_;
+
+  MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat,
+                                      framework::SelectedRows *weight,
+                                      const framework::Tensor &input)
+      : tmat_(tmat), weight_(weight), input_(input) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    auto blas =
+        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+    size_t num_samples = tmat_.dims()[0];
+    size_t input_width = input_.dims()[1];
+    size_t tmat_width = tmat_.dims()[1];
+    size_t weight_width = weight_->value().dims()[1];
+    auto tmat_value = tmat_.data<T>();
+    auto weight_value = weight_->mutable_value()->data<T>();
+    auto input_value = input_.data<T>();
+
+    std::unordered_map<int, std::vector<std::pair<T, const T *>>> ops;
+    ops.reserve(weight_->rows().size());
+
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      const T *input_value_row = input_value + input_width * i;
+      const T *tmat_row = tmat_value + i * tmat_width;
+      for (int j = 0; j < code_length; ++j) {
+        ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+      }
     }
-  }
 
-  for (auto& row : weight->rows()) {
-    auto& op_in_row = ops[row];
-    for (auto& pair : op_in_row) {
-      auto& scale = pair.first;
-      auto* input_row = pair.second;
-      blas.AXPY(input_width, scale, input_row, weight_value);
+    for (auto &row : weight_->rows()) {
+      auto &op_in_row = ops[row];
+      for (auto &pair : op_in_row) {
+        auto &scale = pair.first;
+        auto *input_row = pair.second;
+        blas.AXPY(input_width, scale, input_row, weight_value);
+      }
+      weight_value += weight_width;
     }
-    weight_value += weight_width;
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor &tmat,
+                                            framework::SelectedRows *weight,
+                                            const framework::Tensor &input) {
+  MatrixBitCodeFunctorMulGradWeightSR<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
-                                           const framework::Tensor& weight,
-                                           framework::Tensor* input) {
-  size_t num_samples = tmat.dims()[0];
-  size_t tmat_width = tmat.dims()[1];
-  size_t input_width = input->dims()[1];
-  size_t weight_width = weight.dims()[1];
-  auto tmat_value = tmat.data<T>();
-  auto weight_value = weight.data<T>();
-  auto input_value = input->data<T>();
-
-  for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-
-      for (size_t k = 0; k < input_width; ++k) {
-        input_value[input_width * i + k] +=
-            tmat_value[i * tmat_width + j] *
-            weight_value[weight_width * index + k];
+struct MatrixBitCodeFunctorMulGradError : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  const framework::Tensor &weight_;
+  framework::Tensor *input_;
+
+  MatrixBitCodeFunctorMulGradError(const framework::Tensor &tmat,
+                                   const framework::Tensor &weight,
+                                   framework::Tensor *input)
+      : tmat_(tmat), weight_(weight), input_(input) {}
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    size_t num_samples = tmat_.dims()[0];
+    size_t tmat_width = tmat_.dims()[1];
+    size_t input_width = input_->dims()[1];
+    size_t weight_width = weight_.dims()[1];
+    auto tmat_value = tmat_.data<T>();
+    auto weight_value = weight_.data<T>();
+    auto input_value = input_->data<T>();
+
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+
+        for (size_t k = 0; k < input_width; ++k) {
+          input_value[input_width * i + k] +=
+              tmat_value[i * tmat_width + j] *
+              weight_value[weight_width * index + k];
+        }
       }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor &tmat,
+                                           const framework::Tensor &weight,
+                                           framework::Tensor *input) {
+  MatrixBitCodeFunctorMulGradError<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::Sub(framework::Tensor* tmat) {
-  size_t num_samples = tmat->dims()[0];
-  size_t o_width = tmat->dims()[1];
-  for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      if (code->calc_bit(j)) {
-        tmat->data<T>()[i * o_width + j] -= 1;
+struct MatrixBitCodeFunctorSub : public boost::static_visitor<void> {
+  framework::Tensor *tmat_;
+
+  explicit MatrixBitCodeFunctorSub(framework::Tensor *tmat) : tmat_(tmat) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    size_t num_samples = tmat_->dims()[0];
+    size_t o_width = tmat_->dims()[1];
+    auto *tmat_data = tmat_->data<T>();
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        if (code.calc_bit(j)) {
+          tmat_data[i * o_width + j] -= 1;
+        }
       }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Sub(framework::Tensor *tmat) {
+  MatrixBitCodeFunctorSub<T> func(tmat);
+  code_table_.apply_visitor(func);
 }
 
 template class MatrixBitCodeFunctor<float>;
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index 35ca73802b48982ddf3ed7485b56f50221c9f28c..01e4889d34ad6e409f1b8a9c4bf783800187e863 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <map>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -22,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/variant.h"
 
 #if defined(_WIN32)
 #include <intrin.h>
@@ -98,24 +100,7 @@ inline int clz(const T& value) {
 
 inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
 #endif  // !_WIN32
-// set a code interface to create multiple code
-class Code {
- public:
-  virtual ~Code() {}
-  virtual size_t calc_index(int bit) const = 0;
-  virtual bool calc_bit(int bit) const = 0;
-  virtual int get_length() const = 0;
-};
-// set a CodeTable interface to create multiple code table
-class CodeTable {
- public:
-  virtual std::unique_ptr<Code> get_code(int64_t code) const = 0;
-  virtual size_t size() const = 0;
-  virtual int get_max_code_length() const = 0;
-  virtual ~CodeTable() {}
-};
-
-class SimpleCode : public Code {
+class SimpleCode {
  public:
   SimpleCode(size_t code, size_t num_classes, const int64_t* ids)
       : c_(static_cast<size_t>(ids[code]) + num_classes) {}
@@ -137,16 +122,16 @@ class SimpleCode : public Code {
 };
 
 template <typename T>
-class CustomCode : public Code {
+class CustomCode {
  public:
   CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode,
-             const int64_t* ids, int index)
-      : ids_(ids), index_(index) {
-    ptable_ = ptable.Slice(index, index + 1);
-    pcode_ = pcode.Slice(index, index + 1);
+             const int64_t* ids, int index) {
+    seq_len_ = ptable.dims()[1];
+    ptable_data_ = ptable.data<T>() + seq_len_ * index;
+    pcode_data_ = pcode.data<T>() + seq_len_ * index;
   }
   /**
-   * Here the id of root shoud be 1 rather than 0, thus the encoding of class c
+   * Here the id of root should be 1 rather than 0, thus the encoding of class c
    * is `c + num_classes` and all siblings can get the same weight indice using
    * prefixes.
    * Weight index is the prefixes of encoding, thus leave out the right most
@@ -154,36 +139,37 @@ class CustomCode : public Code {
    * Binary classification path is the suffixes of encoding, thus leave out the
    * left most bit in calc_bit.
    */
-  size_t calc_index(int bit) const { return ptable_.data<T>()[bit]; }
-  bool calc_bit(int bit) const { return pcode_.data<T>()[bit]; }
-  int get_length() const {
-    int length = 0;
+  size_t calc_index(int bit) const { return ptable_data_[bit]; }
+  bool calc_bit(int bit) const { return pcode_data_[bit]; }
 
-    for (int i = 0; i < static_cast<int>(ptable_.dims()[1]); i++) {
-      if (ptable_.data<T>()[i] >= 0) {
-        length++;
-      } else {
-        return length;
-      }
+  // NOTE: this function is not thread-safe.
+  int get_length() const {
+    if (length_ < 0) {
+      auto len = seq_len_;
+      length_ =
+          static_cast<int>(std::find_if(ptable_data_, ptable_data_ + len,
+                                        [](const T& val) { return val < 0; }) -
+                           ptable_data_);
     }
-    return length;
+    return length_;
   }
 
  private:
-  framework::Tensor ptable_;
-  framework::Tensor pcode_;
-  const int64_t* ids_;
-  const int index_;
+  int64_t seq_len_;
+  const T* ptable_data_;
+  const T* pcode_data_;
+  mutable int length_{-1};
 };
 
-class SimpleCodeTable : public CodeTable {
+class SimpleCodeTable {
  public:
   SimpleCodeTable(size_t num_classes, const int64_t* ids)
       : num_classes_(num_classes), ids_(ids) {}
-  std::unique_ptr<Code> get_code(int64_t code) const {
-    std::unique_ptr<Code> coder(new SimpleCode(code, num_classes_, ids_));
-    return coder;
+
+  SimpleCode get_code(int64_t code) const {
+    return SimpleCode(code, num_classes_, ids_);
   }
+
   size_t size() const { return num_classes_; }
   int get_max_code_length() const { return FindLastSet(num_classes_ - 1); }
 
@@ -193,15 +179,14 @@ class SimpleCodeTable : public CodeTable {
 };
 
 template <typename T>
-class CustomCodeTable : public CodeTable {
+class CustomCodeTable {
  public:
   CustomCodeTable(const framework::Tensor& ptable,
                   const framework::Tensor& pcode, const int64_t* ids)
       : ptable_(ptable), pcode_(pcode), ids_(ids) {}
 
-  std::unique_ptr<Code> get_code(int64_t code) const {
-    std::unique_ptr<Code> coder(new CustomCode<T>(ptable_, pcode_, ids_, code));
-    return coder;
+  CustomCode<T> get_code(int64_t code) const {
+    return CustomCode<T>(ptable_, pcode_, ids_, code);
   }
 
   size_t size() const { return static_cast<size_t>(ptable_.dims()[1]); }
@@ -215,19 +200,21 @@ class CustomCodeTable : public CodeTable {
   const int64_t* ids_;
 };
 
+using CodeTable = boost::variant<SimpleCodeTable, CustomCodeTable<int64_t>>;
+
 template <typename T>
 class MatrixBitCodeFunctor {
  public:
   MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids)
       : num_classes_(num_classes),
         ids_(ids),
-        code_table_(new SimpleCodeTable(num_classes, ids)) {}
+        code_table_(SimpleCodeTable(num_classes, ids)) {}
 
   MatrixBitCodeFunctor(const framework::Tensor& ptable,
                        const framework::Tensor& pcode, const int64_t* ids)
       : num_classes_(static_cast<size_t>(ptable.dims()[1])),
         ids_(ids),
-        code_table_(new CustomCodeTable<int64_t>(ptable, pcode, ids)) {}
+        code_table_(CustomCodeTable<int64_t>(ptable, pcode, ids)) {}
   /* For j < code_length
        tmat(i, j) += vec(0, index(i, j))
   */
@@ -277,7 +264,7 @@ class MatrixBitCodeFunctor {
 
   size_t num_classes_;
   const int64_t* ids_;
-  std::unique_ptr<CodeTable> code_table_;
+  CodeTable code_table_;
 };
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt
index eee0b90fbae216e804e62993313796e914fcef5a..6b551d13f1dc5cd1c82a15a8347b278e8f795c1c 100644
--- a/paddle/fluid/operators/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt
@@ -1,5 +1,5 @@
 op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
-file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n")
+file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n")
 nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
   DEPS tensorrt_engine_op
   analysis)
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
index 3cf2ce3c7ef87dcf75548f7d9c3a55d06ed765e8..b993c55fad13e892efd51648b78704bec83bf2b4 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -21,8 +21,6 @@
 
 namespace paddle {
 
-DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
-
 namespace operators {
 
 class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -31,7 +29,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Xs", "A list of inputs.").AsDuplicable();
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
-    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
     AddAttr<int>("max_batch_size", "the maximum batch size.");
     AddAttr<int>("workspace_size", "the workspace size.");
     AddComment("TensorRT engine operator.");
@@ -50,6 +47,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
-                  ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
+                  ops::TensorRTEngineOpMaker);
 
 #endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc
deleted file mode 100644
index cbe1b426f65386e722a7b02ec1fdfdf75bfd770c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    tensorrt_engine,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 5b2aad55a4e4b640c614ad7639c83b49cef3dc07..88c4f508474e66953b79fb92ff1eb0b53a539f07 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -27,8 +27,6 @@
 
 namespace paddle {
 
-DECLARE_int32(tensorrt_engine_batch_size);
-
 namespace operators {
 
 using FluidDT = framework::proto::VarType_Type;
@@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
   return TRT_DT::kINT32;
 }
 
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
   PADDLE_ENFORCE_GT(shape.size(), 1UL,
                     "TensorRT' tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
@@ -63,128 +61,119 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
 }  // namespace // NOLINT
 
 using inference::Singleton;
-using inference::tensorrt::TRT_EngineManager;
+using inference::tensorrt::TensorRTEngine;
+
+class TensorRTEngineOp : public framework::OperatorBase {
+ private:
+  std::vector<std::string> input_names_;
+  std::unordered_set<std::string> param_names_;
+  mutable std::unique_ptr<TensorRTEngine> trt_engine_;
+  int max_batch_size_;
+  int workspace_size_;
 
-class TensorRTEngineOp : public framework::OperatorWithKernel {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
+  TensorRTEngineOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {
+    input_names_ = Inputs("Xs");
+    max_batch_size_ = Attr<int>("max_batch_size");
+    workspace_size_ = Attr<int>("workspace_size");
+
+    auto params = Attr<std::vector<std::string>>("parameters");
+    for (const auto &param : params) {
+      param_names_.insert(param);
+    }
+  }
 
  protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input0 = ctx.Inputs("Xs").front();
-    framework::OpKernelType kt = framework::OpKernelType(
-        ctx.scope().FindVar(input0)->GetMutable<framework::LoDTensor>()->type(),
-        ctx.GetPlace());
-    return kt;
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    RunTrt(scope, dev_place);
   }
-};
 
-template <typename DeviceContext, typename T>
-class TensorRTEngineKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto engine_name = context.Attr<std::string>("engine_uniq_key");
-    int max_batch_size = context.Attr<int>("max_batch_size");
-    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
-      Prepare(context);
+  void RunTrt(const framework::Scope &scope,
+              const platform::Place &dev_place) const {
+    int runtime_batch = 1;
+    if (trt_engine_.get() == nullptr) {
+      trt_engine_.reset(new TensorRTEngine(
+          max_batch_size_, workspace_size_, nullptr,
+          boost::get<platform::CUDAPlace>(dev_place).device));
+      Prepare(scope, dev_place, trt_engine_.get());
     }
-    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
-    auto input_names = context.op().Inputs("Xs");
-    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);
+
+    auto *engine = trt_engine_.get();
+    PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
 
     std::vector<std::string> output_maps =
-        context.Attr<std::vector<std::string>>("output_name_mapping");
+        Attr<std::vector<std::string>>("output_name_mapping");
 
-    auto params = context.Attr<std::vector<std::string>>("parameters");
-    std::unordered_set<std::string> parameters;
-    for (const auto& param : params) {
-      parameters.insert(param);
-    }
     // Convert input tensor from fluid to engine.
-    for (const auto& x : context.Inputs("Xs")) {
-      if (parameters.count(x)) continue;
+    for (const auto &x : Inputs("Xs")) {
+      if (param_names_.count(x)) continue;
       // convert input and copy to TRT engine's buffer
-      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
-          context.scope(), x);
+      auto &t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+      auto t_shape = framework::vectorize(t.dims());
+      runtime_batch = t_shape[0];
       if (platform::is_cpu_place(t.place())) {
-        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromCPU(x, static_cast<const void *>(t.data<void>()),
                                 t.memory_size());
       } else {
-        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromGPU(x, static_cast<const void *>(t.data<void>()),
                                 t.memory_size());
       }
     }
+
+    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
     // Execute the engine.
-    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
-    engine->Execute(FLAGS_tensorrt_engine_batch_size);
+    engine->Execute(runtime_batch);
 
     // Convert output tensor from engine to fluid
     int output_index = 0;
     VLOG(4) << "TensorRT Engine Op Outputs:";
-    for (const auto& y : context.Outputs("Ys")) {
+    for (const auto &y : Outputs("Ys")) {
       VLOG(4) << y;
       // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
+      nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       // The ITensor doesn't contain the batch size dim.
       std::vector<int> ddim;
-      ddim.push_back(FLAGS_tensorrt_engine_batch_size);
+      ddim.push_back(runtime_batch);
       for (int i = 0; i < dims.nbDims; i++) {
         ddim.push_back(dims.d[i]);
       }
 
-      auto* fluid_v = context.scope().FindVar(y);
+      auto *fluid_v = scope.FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
-      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
+      auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
 
       fluid_t->Resize(framework::make_ddim(ddim));
 
-      // TODO(Superjomn) find some way to determine which device to output the
-      // tensor.
-      // if (platform::is_cpu_place(fluid_t->place())) {
       // TODO(Superjomn) change this float to dtype size.
-      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
-                  FLAGS_tensorrt_engine_batch_size;
+      auto size =
+          inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch;
       engine->GetOutputInGPU(
           output_maps[output_index],
           fluid_t->mutable_data<float>(platform::CUDAPlace(
-              boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
+              boost::get<platform::CUDAPlace>(dev_place).device)),
           size * sizeof(float));
-
       output_index += 1;
     }
 
     cudaStreamSynchronize(*engine->stream());
   }
 
- protected:
-  void Prepare(const framework::ExecutionContext& context) const {
+  void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
+               TensorRTEngine *engine) const {
     VLOG(4) << "Prepare engine";
-    // Get the ProgramDesc and pass to convert.
     framework::proto::BlockDesc block_desc;
-    block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-    int max_batch_size = context.Attr<int>("max_batch_size");
-    int workspace_size = context.Attr<int>("workspace_size");
-
-    auto params = context.Attr<std::vector<std::string>>("parameters");
-    std::unordered_set<std::string> parameters;
-    for (const auto& param : params) {
-      parameters.insert(param);
-    }
+    block_desc.ParseFromString(Attr<std::string>("subgraph"));
 
     std::vector<std::string> output_maps =
-        context.Attr<std::vector<std::string>>("output_name_mapping");
-
-    // TODO(Superjomn) replace this with a different stream
-    auto* engine = Singleton<TRT_EngineManager>::Global().Create(
-        max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
-        context.Attr<std::string>("engine_uniq_key"),
-        boost::get<platform::CUDAPlace>(context.GetPlace()).device);
+        Attr<std::vector<std::string>>("output_name_mapping");
 
     engine->InitNetwork();
 
@@ -192,39 +181,33 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     VLOG(4) << "parsed var size " << block.AllVars().size();
     // Add inputs
     VLOG(4) << "declare inputs";
-    for (auto& input : context.Inputs("Xs")) {
-      if (parameters.count(input)) continue;
+    for (auto &input : Inputs("Xs")) {
+      if (param_names_.count(input)) continue;
       VLOG(4) << "declare input " << input;
-      auto* var = block.FindVar(input);
+
+      auto &t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
+      auto t_shape = framework::vectorize(t.dims());
+
+      auto *var = block.FindVar(input);
       // TensorRT engine need to create parameters. The parameter's description
       // should be set in
       PADDLE_ENFORCE(var, "no variable called %s", input);
       PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                         "TensorRT engine only takes LoDTensor as input");
-      auto shape = var->GetShape();
-      // For the special batch_size placeholder -1, drop it and pass the real
-      // shape of data.
-      // TODO(Superjomn) fix this with batch broadcast, or it can't handle
-      // variational batch size.
-      if (shape[0] == -1) {
-        shape[0] = FLAGS_tensorrt_engine_batch_size;
-      }
+
       engine->DeclareInput(
           input, FluidDataType2TRT(
                      var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(shape));
+          Vec2TRT_Dims(t_shape));
     }
-
     inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlock(block_desc, parameters, context.scope(), engine);
+        .ConvertBlock(block_desc, param_names_, scope, engine);
 
     // Add outputs
-    for (auto& output : output_maps) {
-      if (!engine->HasDeclared(output)) {
-        engine->DeclareOutput(output);
-      }
+    for (auto &output : output_maps) {
+      engine->DeclareOutput(output);
     }
-
     engine->FreezeNetwork();
   }
 };
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index 56bdd6c2f2801967829f2baf889b5517a1d9d8d9..287b0edc96e5e312b0ff1725ee188ff319d44d23 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -24,8 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
-USE_CUDA_ONLY_OP(tensorrt_engine);
-
+USE_NO_KERNEL_OP(tensorrt_engine);
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 23c7ebe84221986a5f7ac7583c3a8e17d04fe4af..2f205e1d5ca30d67a55e4df0f5e879ffef9a9c26 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -67,6 +67,13 @@ ENDIF()
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
     place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+if(WIN32)
+    if(WITH_GPU AND NOT WITH_DSO)
+        get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
+        target_link_libraries(device_context ${cuda_modules})
+    endif(WITH_GPU AND NOT WITH_DSO)
+endif(WIN32)
+
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
 cc_test(init_test SRCS init_test.cc DEPS device_context)
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 55dba545ff133b1c219ee58f6d1bb2d2130d1a59..c70e3be858fe72f298a5e553bcca189641392cdc 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -16,6 +16,26 @@ limitations under the License. */
 
 #include <stddef.h>
 
+#ifdef _WIN32
+#if defined(__AVX2__)
+#include <immintrin.h>  //avx2
+#elif defined(__AVX__)
+#include <intrin.h>  //avx
+#endif               // AVX
+#else                // WIN32
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+#endif  // WIN32
+
+#if defined(_WIN32)
+#define ALIGN32_BEG __declspec(align(32))
+#define ALIGN32_END
+#else
+#define ALIGN32_BEG
+#define ALIGN32_END __attribute__((aligned(32)))
+#endif  // _WIN32
+
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index f0a973662360fd9ff35e1006cce937d86f3e563c..c3f9433503accf98d30ccaa57b9b4b8f3c68666a 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -82,6 +82,8 @@ extern void* mklml_dso_handle;
   __macro(vdSqr);                   \
   __macro(vsPowx);                  \
   __macro(vdPowx);                  \
+  __macro(vsInv);                   \
+  __macro(vdInv);                   \
   __macro(MKL_Set_Num_Threads)
 
 MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc
index 466c77469ef256179c52442d21c1d62dfc4ef1bb..5a9e24374f6f777c2286b8928eae9dcbe8be6378 100644
--- a/paddle/fluid/platform/stream_callback_manager.cc
+++ b/paddle/fluid/platform/stream_callback_manager.cc
@@ -19,7 +19,7 @@ namespace paddle {
 namespace platform {
 
 #if CUDA_VERSION >= 10000
-static void CUDART_CB StreamCallbackFunc(void *user_data);
+static void CUDART_CB StreamCallbackFunc(void *user_data)
 #else
 static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
                                          cudaError_t status, void *user_data)
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index b8954cb12628d1f4f333956e0213ddf9c01e592c..c79d5d9403db613a8cdda59b9874a8b886458357 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -19,10 +19,6 @@ if(WITH_PYTHON)
   endif(WITH_AMD_GPU)
 
   if(WIN32)
-    if(WITH_GPU AND NOT WITH_DSO)
-      get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
-      target_link_libraries(paddle_pybind ${cuda_modules})
-    endif(WITH_GPU AND NOT WITH_DSO)
     target_link_libraries(paddle_pybind shlwapi)
   endif(WIN32)
 
diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc
index 470e8b050808295d49728bbdb757b6a612df9a01..222c128c66f37a259eb17527fe2586860f701275 100644
--- a/paddle/fluid/pybind/async_executor_py.cc
+++ b/paddle/fluid/pybind/async_executor_py.cc
@@ -41,6 +41,23 @@ namespace pd = paddle::framework;
 namespace paddle {
 namespace pybind {
 using set_name_func = void (pd::DataFeedDesc::*)(const std::string&);
+#ifdef PADDLE_WITH_PSLIB
+void BindAsyncExecutor(py::module* m) {
+  py::class_<framework::AsyncExecutor>(*m, "AsyncExecutor")
+      .def(py::init([](framework::Scope* scope, const platform::Place& place) {
+        return std::unique_ptr<framework::AsyncExecutor>(
+            new framework::AsyncExecutor(scope, place));
+      }))
+      .def("run_from_files", &framework::AsyncExecutor::RunFromFile)
+      .def("init_server", &framework::AsyncExecutor::InitServer)
+      .def("init_worker", &framework::AsyncExecutor::InitWorker)
+      .def("start_server", &framework::AsyncExecutor::StartServer)
+      .def("stop_server", &framework::AsyncExecutor::StopServer)
+      .def("gather_servers", &framework::AsyncExecutor::GatherServers)
+      .def("init_model", &framework::AsyncExecutor::InitModel)
+      .def("save_model", &framework::AsyncExecutor::SaveModel);
+}  // end BindAsyncExecutor
+#else
 void BindAsyncExecutor(py::module* m) {
   py::class_<framework::AsyncExecutor>(*m, "AsyncExecutor")
       .def(py::init([](framework::Scope* scope, const platform::Place& place) {
@@ -49,5 +66,6 @@ void BindAsyncExecutor(py::module* m) {
       }))
       .def("run_from_files", &framework::AsyncExecutor::RunFromFile);
 }  // end BindAsyncExecutor
+#endif
 }  // end namespace pybind
 }  // end namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 017598e1707ffdde1396e7d2bb0bbcbdfa547e71..737ae2dd9c3451a0c9aabd31a5ec05b908356c98 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -960,6 +960,14 @@ All parameter, weight, gradient are variables in Paddle.
           R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
                      to fuse elementwise_add_op and activation_op,
                      it may make the execution faster. Default False)DOC")
+      .def_property(
+          "memory_optimize",
+          [](const BuildStrategy &self) { return self.memory_optimize_; },
+          [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; })
+      .def_property(
+          "memory_early_delete",
+          [](const BuildStrategy &self) { return self.memory_early_delete_; },
+          [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; })
       .def("_finalize_strategy_and_create_passes",
            [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
              return self.CreatePassesFromStrategy(true);
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 48263a1f0580f697a8b4edbcf9fe60e0bb15eba9..2a4428eb705d9b19bf8d1f2970f54a503feef7b0 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -151,7 +151,7 @@ def __bootstrap__():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
             'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
-            'cudnn_exhaustive_search', 'selected_gpus'
+            'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus'
         ]
 
     core.init_gflags([sys.argv[0]] +
diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py
index 2664a7301db3bf471126ff26504e7042f02b7d84..4ca6a5170eb57b0d799159b7ecc55c2389246041 100644
--- a/python/paddle/fluid/async_executor.py
+++ b/python/paddle/fluid/async_executor.py
@@ -24,6 +24,8 @@ from paddle.fluid.proto import data_feed_pb2
 from google.protobuf import text_format
 from . import io
 from .data_feed_desc import DataFeedDesc
+from .distributed import ps_instance
+from .contrib.utils import hdfs_utils as hdfs
 
 __all__ = ['AsyncExecutor']
 
@@ -74,7 +76,7 @@ class AsyncExecutor(object):
     Note: Only running on CPUPlace supported.
     """
 
-    def __init__(self, place=None):
+    def __init__(self, place=None, run_mode=""):
         if place is None:
             place = core.CPUPlace()
         if not isinstance(place, core.CPUPlace):
@@ -85,8 +87,16 @@ class AsyncExecutor(object):
 
         scope = global_scope()
         self.executor = core.AsyncExecutor(scope, p)
+        self.instance = None
 
-    def run(self, program, data_feed, filelist, thread_num, fetch, debug=False):
+    def run(self,
+            program,
+            data_feed,
+            filelist,
+            thread_num,
+            fetch,
+            mode="",
+            debug=False):
         """
         Run program by this AsyncExecutor. Training dataset will be in filelist.
         Users can also inspect certain variables by naming them in parameter
@@ -106,6 +116,7 @@ class AsyncExecutor(object):
             thread_num(int): number of concurrent training threads. See
                              :code:`Note` for how to set this properly
             fetch(str|list): the var name or a list of var names to inspect
+            mode(str): run mode of this interface
             debug(bool): When set to True, fetch vars will be printed to
                          standard output after each minibatch
 
@@ -148,4 +159,152 @@ class AsyncExecutor(object):
 
         self.executor.run_from_files(program_desc,
                                      data_feed.desc(), filelist, thread_num,
-                                     fetch_var_names, debug)
+                                     fetch_var_names, mode, debug)
+
+    def download_data(self,
+                      afs_path,
+                      local_path,
+                      fs_default_name,
+                      ugi,
+                      file_cnt,
+                      hadoop_home="$HADOOP_HOME",
+                      process_num=12):
+        """
+        download_data is a default download method for distributed training;
+        a user can also download data without this method.
+
+        Example:
+            >>> exe = fluid.AsyncExecutor()
+            >>> exe.config_distributed_nodes()
+            >>> exe.download_data("/xxx/xxx/xx/", "./data",
+            >>>                   "afs://xxx.xxx.xxx.xxx:9901", "xxx,yyy", 10)
+        Args:
+            afs_path(str): afs_path defined by users
+            local_path(str): download data path
+            fs_default_name(str): file system server address
+            ugi(str): hadoop ugi
+            file_cnt(int): a user can specify file number for debugging
+            hadoop_home(str): hadoop home path
+            process_num(int): download process num
+        """
+        if self.instance is None:
+            raise ValueError('instance is None, please run '
+                             'config_distributed_nodes init instance')
+
+        configs = {"fs.default.name": fs_default_name, "hadoop.job.ugi": ugi}
+
+        client = hdfs.HDFSClient(hadoop_home, configs)
+        downloads = hdfs.multi_download(
+            client,
+            afs_path,
+            local_path,
+            self.instance.get_worker_index(),
+            self.instance.get_node_cnt() // 2,  # keep an int on Python 3
+            file_cnt,
+            multi_processes=process_num)
+        self.instance.barrier_worker()  #wait for download_data
+
+    def get_instance(self):
+        """
+        get current node's instance so that user can do operations
+        in distributed setting
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        return self.instance
+
+    def config_distributed_nodes(self):
+        """
+        if a user needs to run distributed async executor
+        he or she needs to do a global configuration so that 
+        information of current process can be obtained
+        """
+        self.instance = ps_instance.PaddlePSInstance(1, 2)
+        return self.instance
+
+    def stop(self):
+        """
+        at the end of process, users should call stop to stop the servers
+        and barrier all workers
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        self.instance.barrier_worker()  #worker do all things
+        if self.instance.is_first_worker():
+            self.executor.stop_server()
+        self.instance.barrier_worker()  #sync
+        self.instance.barrier_all()
+        self.instance.finalize()
+
+    def init_server(self, dist_desc):
+        """
+        initialize server of current node if current process is a server
+        Args:
+        dist_desc(str): a protobuf string that describes 
+                        how to init a worker and a server
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        self.executor.init_server(dist_desc, self.instance._rankid)
+        ip = self.executor.start_server()
+        self.instance.set_ip(ip)
+        self.instance.barrier_all()  #wait all server start
+        ips = self.instance.gather_ips()
+        self.executor.gather_servers(ips, self.instance.get_node_cnt())
+        self.instance.barrier_all()  #wait all worker start
+
+    def init_worker(self, dist_desc, startup_program):
+        """
+        initialize worker of current node if current process is a worker
+        Args:
+        dist_desc(str): a protobuf string that describes
+                        how to init a worker and a server
+        startup_program(fluid.Program): startup program of current process
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        place = core.CPUPlace()
+        executor = Executor(place)
+        executor.run(startup_program)
+
+        self.instance.barrier_all()  #wait all server start
+        ips = self.instance.gather_ips()
+        self.executor.init_worker(dist_desc, ips,
+                                  self.instance.get_node_cnt(),
+                                  self.instance._rankid)
+        self.instance.barrier_all()  #wait all worker start
+        if self.instance.is_first_worker():
+            self.executor.init_model()
+        self.instance.barrier_worker()  #wait init model
+
+    def init_model(self):
+        """
+        init_model command that can be invoked from one of the workers
+        model parameters are initialized in servers
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        self.executor.init_model()
+
+    def save_model(self, save_path):
+        """
+        save_model command that can be invoked from one of the workers
+        model parameters are saved in servers and uploaded to save_path of file system
+        Args:
+        save_path(str): save path to file system
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        self.executor.save_model(save_path)
diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py
index 6e479bdc2b93c1189ba07a6f20b2408c34110b93..20b2cc381aaa1b837ce106410246bc8cedb2fc88 100644
--- a/python/paddle/fluid/contrib/utils/__init__.py
+++ b/python/paddle/fluid/contrib/utils/__init__.py
@@ -13,10 +13,10 @@
 # limitations under the License.
 
 from __future__ import print_function
-from . import lookup_table_utils
-from .lookup_table_utils import *
+#from . import lookup_table_utils
+#from .lookup_table_utils import *
 from . import hdfs_utils
 from .hdfs_utils import *
 
-__all__ = lookup_table_utils.__all__
+#__all__ = lookup_table_utils.__all__
 __all__ = hdfs_utils.__all__
diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py
index 251665d85e166f4ebf66eced7a5889ee9fc23e08..baea57ccce0e9ca3a8fab244e43a107a89cfe67d 100644
--- a/python/paddle/fluid/contrib/utils/hdfs_utils.py
+++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py
@@ -32,6 +32,28 @@ _logger.setLevel(logging.INFO)
 
 
 class HDFSClient(object):
+    """
+    A tool of HDFS 
+
+    Args:
+        hadoop_home (string): hadoop_home 
+        configs (dict): hadoop config; it must contain the keys
+            "fs.default.name" and "hadoop.job.ugi"
+
+    Examples:
+        hadoop_home = "/home/client/hadoop-client/hadoop/"
+
+        configs = {
+            "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+            "hadoop.job.ugi": "hello,hello123"
+        }
+
+        client = HDFSClient(hadoop_home, configs)
+
+        client.ls("/user/com/train-25")
+        files = client.lsr("/user/com/train-25/models")
+    """
+
     def __init__(self, hadoop_home, configs):
         self.pre_commands = []
         hadoop_bin = '%s/bin/hadoop' % hadoop_home
@@ -52,9 +74,13 @@ class HDFSClient(object):
         ret_code = 0
         ret_out = None
         ret_err = None
+        whole_commands = " ".join(whole_commands)
         for x in range(retry_times + 1):
             proc = subprocess.Popen(
-                whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                whole_commands,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                shell=True)
             (output, errors) = proc.communicate()
             ret_code, ret_out, ret_err = proc.returncode, output, errors
             if ret_code:
@@ -68,10 +94,12 @@ class HDFSClient(object):
     def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
         """
             upload the local file to hdfs
-            args:
-                local_file_path: the local file path
-                remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
-            return:
+            Args:
+                hdfs_path: hdfs path, target path 
+                local_path: local file path, source path
+                overwrite: will overwrite the original file
+                retry_times: max times retry to upload
+            Returns:
                 True or False
         """
         assert hdfs_path is not None
@@ -114,10 +142,12 @@ class HDFSClient(object):
     def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
         """
             download from hdfs
-            args:
-                local_file_path: the local file path
-                remote_file_path: remote dir on hdfs
-            return:
+            Args:
+                hdfs_path: hdfs path, source path
+                local_path: local file path, target path
+                overwrite: will remove original file and overwrite it.
+                unzip: ignore this param
+            Returns:
                 True or False
         """
         _logger.info('Downloading %r to %r.', hdfs_path, local_path)
@@ -159,11 +189,11 @@ class HDFSClient(object):
     def is_exist(self, hdfs_path=None):
         """
             whether the remote hdfs path exists?
-            args:
-                remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
+            Args:
+                hdfs_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
                 fs_name: The default values are the same as in the job configuration
                 fs_ugi: The default values are the same as in the job configuration
-            return:
+            Returns:
                 True or False
         """
         exist_cmd = ['-test', '-e', hdfs_path]
@@ -182,11 +212,11 @@ class HDFSClient(object):
     def is_dir(self, hdfs_path=None):
         """
             whether the remote hdfs path exists?
-            args:
+            Args:
                 remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
                 fs_name: The default values are the same as in the job configuration
                 fs_ugi: The default values are the same as in the job configuration
-            return:
+            Returns:
                 True or False
         """
 
@@ -206,15 +236,17 @@ class HDFSClient(object):
             return True
 
     def delete(self, hdfs_path):
-        """Remove a file or directory from HDFS.
-
-        :param hdfs_path: HDFS path.
-        :param recursive: Recursively delete files and directories. By default,
-          this method will raise an :class:`HdfsError` if trying to delete a
-          non-empty directory.
+        """
+            Remove a file or directory from HDFS.
 
-        This function returns `True` if the deletion was successful and `False` if
-        no file or directory previously existed at `hdfs_path`.
+        Args:
+            param hdfs_path: HDFS path.
+            param recursive: Recursively delete files and directories. By default,
+            this method will raise an :class:`HdfsError` if trying to delete a
+            non-empty directory.
+        Returns:
+            This function returns `True` if the deletion was successful and `False` if
+            no file or directory previously existed at `hdfs_path`.
 
         """
         _logger.info('Deleting %r.', hdfs_path)
@@ -240,14 +272,17 @@ class HDFSClient(object):
             return True
 
     def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
-        """Move a file or folder.
-
-        :param hdfs_src_path: Source path.
-        :param hdfs_dst_path: Destination path. If the path already exists and is
-          a directory, the source will be moved into it. If the path exists and is
-          a file, or if a parent destination directory is missing, this method will
-          raise an :class:`HdfsError`.
-
+        """
+        Rename a file or folder.
+        Args:    
+            :param hdfs_src_path: Source path.
+            :param hdfs_dst_path: Destination path. If the path already exists and is
+              a directory, the source will be moved into it. If the path exists and is
+              a file, or if a parent destination directory is missing, this method will
+              raise an :class:`HdfsError`.
+        Returns:
+             This function returns `True` if the rename was successful and `False` if
+             the rename failed.
         """
         assert hdfs_src_path is not None
         assert hdfs_dst_path is not None
@@ -273,6 +308,11 @@ class HDFSClient(object):
 
     @staticmethod
     def make_local_dirs(local_path):
+        """
+        Create a local directory (including missing parents), similar to mkdir.
+        Args:
+            local_path: local path of the directory to create.
+        """
         try:
             os.makedirs(local_path)
         except OSError as e:
@@ -281,9 +321,11 @@ class HDFSClient(object):
 
     def makedirs(self, hdfs_path):
         """Create a remote directory, recursively if necessary.
-
-        :param hdfs_path: Remote path. Intermediate directories will be created
-          appropriately.
+        Args:
+            :param hdfs_path: Remote path. Intermediate directories will be created
+              appropriately.
+        Returns:
+            True if the directories were created successfully, False otherwise.
         """
         _logger.info('Creating directories to %r.', hdfs_path)
         assert hdfs_path is not None
@@ -303,6 +345,13 @@ class HDFSClient(object):
             return True
 
     def ls(self, hdfs_path):
+        """
+        ls a hdfs_path.
+        Args:    
+            :param hdfs_path: hdfs_path will be ls.
+        Returns:
+             This function returns a `list` that contains all files in the hdfs_path.
+        """
         assert hdfs_path is not None
 
         if not self.is_exist(hdfs_path):
@@ -328,6 +377,14 @@ class HDFSClient(object):
             return ret_lines
 
     def lsr(self, hdfs_path, only_file=True, sort=True):
+        """
+        ls a hdfs_path sort by time.
+        Args:    
+            :param hdfs_path: hdfs_path will be ls.
+        Returns:
+             This function returns a `list` that contains all files sorted by time in the hdfs_path.
+        """
+
         def sort_by_time(v1, v2):
             v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M')
             v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M')
@@ -371,12 +428,15 @@ def multi_upload(client,
                  multi_processes=5,
                  overwrite=False):
     """
-    :param overwrite: will overwrite hdfs file or not
-    :param multi_processes: the upload data process at the same time, default=5
-    :param client: instance of HDFSClient
-    :param hdfs_path: path on hdfs
-    :param local_path: path on local
-    :return:
+    Upload file to hdfs.
+    Args:
+        :param overwrite: will overwrite hdfs file or not
+        :param multi_processes: the upload data process at the same time, default=5
+        :param client: instance of HDFSClient
+        :param hdfs_path: path on hdfs
+        :param local_path: path on local
+    Returns:
+        
     """
 
     def __subprocess_upload(datas):
@@ -386,6 +446,13 @@ def multi_upload(client,
             client.upload(hdfs_re_path, data, overwrite, retry_times=5)
 
     def get_local_files(path):
+        """
+            Get all local files
+        Args:
+            path: local file path
+        Returns:
+            A list that contains all files in the path.
+        """
         rlist = []
 
         if not os.path.isdir(path):
@@ -426,16 +493,21 @@ def multi_download(client,
                    local_path,
                    trainer_id,
                    trainers,
+                   file_cnt,
                    multi_processes=5):
     """
     multi_download
-    :param client: instance of HDFSClient
-    :param hdfs_path: path on hdfs
-    :param local_path: path on local
-    :param trainer_id: current trainer id
-    :param trainers: all trainers number
-    :param multi_processes: the download data process at the same time, default=5
-    :return: None
+    Args:
+        :param client: instance of HDFSClient
+        :param hdfs_path: path on hdfs
+        :param local_path: path on local
+        :param trainer_id: current trainer id
+        :param trainers: all trainers number
+        :param file_cnt: all file number
+        :param multi_processes: the download data process at the same time, default=5
+        :return: None
+    Returns:
+        A list that be downloaded. 
     """
 
     def __subprocess_download(datas):
@@ -449,7 +521,7 @@ def multi_download(client,
     client.make_local_dirs(local_path)
     _logger.info("Make local dir {} successfully".format(local_path))
 
-    all_need_download = client.lsr(hdfs_path, sort=True)
+    all_need_download = client.lsr(hdfs_path, sort=True)[:file_cnt]
     need_download = all_need_download[trainer_id::trainers]
     _logger.info("Get {} files From all {} files need to be download from {}".
                  format(len(need_download), len(all_need_download), hdfs_path))
@@ -500,6 +572,7 @@ if __name__ == "__main__":
         "/home/xx/data1",
         1,
         5,
+        100,
         multi_processes=5)
 
     multi_upload(client, "/user/com/train-25/model", "/home/xx/data1")
diff --git a/python/paddle/fluid/distribute_lookup_table.py b/python/paddle/fluid/distribute_lookup_table.py
index 52d9ce75f8d73eb3c3e8683bc0793e9dd8fbe48d..74824f6832442d5090e0cea2962ca2f68b7a0181 100644
--- a/python/paddle/fluid/distribute_lookup_table.py
+++ b/python/paddle/fluid/distribute_lookup_table.py
@@ -15,12 +15,52 @@
 LOOKUP_TABLE_TYPE = "lookup_table"
 
 
+def find_distributed_lookup_table_inputs(program, table_name):
+    """
+    Collect the "Ids" input variables of every lookup_table op whose "W"
+    input is the given table. Only one distributed table is supported now.
+    Args:
+        program(Program): program whose global block ops are scanned
+        table_name(str): name of the table, found beforehand
+    Returns:
+        list of variables, looked up by name in program.current_block().vars
+    """
+    local_vars = program.current_block().vars
+    inputs = []
+    for op in program.global_block().ops:
+        if op.type == LOOKUP_TABLE_TYPE:
+            if table_name == op.input("W")[0]:
+                inputs.extend([local_vars[name] for name in op.input("Ids")])
+    return inputs
+
+
+def find_distributed_lookup_table_outputs(program, table_name):
+    """
+    Collect the "Out" output variables of every lookup_table op whose "W"
+    input is the given table. Only one distributed table is supported now.
+    Args:
+        program(Program): program whose global block ops are scanned
+        table_name(str): name of the table, found beforehand
+    Returns:
+        list of variables, looked up by name in program.current_block().vars
+    """
+    local_vars = program.current_block().vars
+    outputs = []
+    for op in program.global_block().ops:
+        if op.type == LOOKUP_TABLE_TYPE:
+            if table_name == op.input("W")[0]:
+                outputs.extend([local_vars[name] for name in op.output("Out")])
+    return outputs
+
+
 def find_distributed_lookup_table(program):
     """
     Find distribute lookup table in program.
     We only support one distribute table now.
-    :param program:
-    :return: table_name or None
+    Args:
+    program(Program): given program, locate distributed lookup table
+    Returns:
+    table_name or None
     """
     table_name = None
 
diff --git a/python/paddle/fluid/distributed/__init__.py b/python/paddle/fluid/distributed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd609c504078b907221a689fbb4e910ec8d54270
--- /dev/null
+++ b/python/paddle/fluid/distributed/__init__.py
@@ -0,0 +1,12 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py
new file mode 100644
index 0000000000000000000000000000000000000000..87dfab92c53d9950d4606e078cc9f51bcda8f4d3
--- /dev/null
+++ b/python/paddle/fluid/distributed/downpour.py
@@ -0,0 +1,105 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from .node import DownpourServer
+from .node import DownpourWorker
+from ..backward import append_backward
+import ps_pb2 as pslib
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_inputs
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs
+from google.protobuf import text_format
+
+
+class DownpourSGD(object):
+    """
+    Distributed optimizer of downpour stochastic gradient descent
+    Standard implementation of Google's Downpour SGD
+    in Large Scale Distributed Deep Networks
+
+    Args:
+        learning_rate (float): the learning rate used to update parameters.
+        window (int): forwarded to DownpourWorker when building the worker desc
+    Examples:
+        .. code-block:: python
+
+             downpour_sgd = fluid.distributed.DownpourSGD(learning_rate=0.2)
+             downpour_sgd.minimize(cost)
+    """
+
+    def __init__(self, learning_rate=0.001, window=1):
+        # todo(guru4elephant): add more optimizers here as argument
+        # todo(guru4elephant): make learning_rate as a variable
+        self.learning_rate_ = learning_rate
+        self.window_ = window
+        self.type = "downpour"
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
+        """
+        DownpourSGD is a distributed optimizer so
+        that user can call minimize to generate backward
+        operators and optimization operators within minimize function
+        Args:
+            loss(Variable): loss variable defined by user
+            startup_program(Program): startup program; not referenced by this method
+            parameter_list(str list): parameter names defined by users
+            no_grad_set(set): a set of variables that is defined by users
+            so that these variables do not need gradient computation
+        Returns:
+            [ps_param, worker_skipped_ops]
+            ps_param: parameter server protobuf desc
+            worker_skipped_ops: operator names that need
+            to be skipped during execution
+        """
+        params_grads = sorted(
+            append_backward(loss, parameter_list, no_grad_set),
+            key=lambda x: x[0].name)
+        table_name = find_distributed_lookup_table(loss.block.program)
+        prefetch_slots = find_distributed_lookup_table_inputs(
+            loss.block.program, table_name)
+        prefetch_slots_emb = find_distributed_lookup_table_outputs(
+            loss.block.program, table_name)
+        server = DownpourServer()
+        # window is communication strategy
+        worker = DownpourWorker(self.window_)
+        # Todo(guru4elephant): support multiple tables definitions
+        # currently support one big sparse table
+        sparse_table_index = 0
+        # currently merge all dense parameters into one dense table
+        dense_table_index = 1
+        params = []
+        grads = []
+        for i in params_grads:
+            params.append(i[0])
+        for i in params_grads:
+            grads.append(i[1])
+        server.add_sparse_table(sparse_table_index, self.learning_rate_,
+                                prefetch_slots, prefetch_slots_emb)
+        server.add_dense_table(dense_table_index, self.learning_rate_, params,
+                               grads)
+        worker.add_sparse_table(sparse_table_index, self.learning_rate_,
+                                prefetch_slots, prefetch_slots_emb)
+        worker.add_dense_table(dense_table_index, self.learning_rate_, params,
+                               grads)
+        ps_param = pslib.PSParameter()
+        ps_param.server_param.CopyFrom(server.get_desc())
+        ps_param.trainer_param.CopyFrom(worker.get_desc())
+        # Todo(guru4elephant): figure out how to support more sparse parameters
+        # currently only support lookup_table
+        worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
+        ps_param.trainer_param.skip_op.extend(worker_skipped_ops)
+        return [ps_param, worker_skipped_ops]
diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..06d3d0315cf2932847b79ea799fc592692383287
--- /dev/null
+++ b/python/paddle/fluid/distributed/helper.py
@@ -0,0 +1,85 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class FileSystem(object):
+    """
+    Builds the pslib FsClientParameter desc used by async_executor.
+
+    Args:
+        fs_type (string): file system type, e.g. "afs" (currently not stored)
+        uri (string): file system uri, e.g. "afs://xx"
+        user (string), passwd (string): hadoop params, both must not be None
+        hadoop_bin (string): hadoop param
+    Examples:
+        fs = FileSystem(user="xx", passwd="xx", hadoop_bin="hadoop")
+    """
+
+    def __init__(self,
+                 fs_type="afs",
+                 uri="afs://xx",
+                 user=None,
+                 passwd=None,
+                 hadoop_bin=""):
+        assert user is not None
+        assert passwd is not None
+        assert hadoop_bin is not None
+        from . import ps_pb2 as pslib  # explicit relative import (py2 + py3)
+        self.fs_client = pslib.FsClientParameter()
+        self.fs_client.uri = uri
+        self.fs_client.user = user
+        self.fs_client.passwd = passwd
+        #self.fs_client.buffer_size = 0
+        self.fs_client.hadoop_bin = hadoop_bin
+        #self.fs_client.afs_conf = afs_conf if not afs_conf else ""
+
+    def get_desc(self):
+        """
+        Return the pslib.FsClientParameter filled in by __init__.
+        """
+        return self.fs_client
+
+
+class MPIHelper(object):
+    """
+    Thin wrapper around mpi4py's MPI.COMM_WORLD: exposes rank, size and
+    Finalize, plus the local hostname / IP resolved through socket.
+    The constructor takes no arguments; mpi4py is imported inside it.
+    Examples:
+        mh = MPIHelper()
+        mh.get_ip()
+    """
+
+    def __init__(self):
+        from mpi4py import MPI
+        self.comm = MPI.COMM_WORLD
+        self.MPI = MPI
+
+    def get_rank(self):
+        return self.comm.Get_rank()
+
+    def get_size(self):
+        return self.comm.Get_size()
+
+    def get_ip(self):
+        import socket
+        local_ip = socket.gethostbyname(socket.gethostname())
+        return local_ip
+
+    def get_hostname(self):
+        import socket
+        return socket.gethostname()
+
+    def finalize(self):
+        self.MPI.Finalize()
diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py
new file mode 100644
index 0000000000000000000000000000000000000000..41e0d64e0b788b0e354f7635c3d3e52d6bba7e23
--- /dev/null
+++ b/python/paddle/fluid/distributed/node.py
@@ -0,0 +1,179 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import ps_pb2 as pslib
+
+
+class Server(object):
+    """
+        Empty base class for server descs; DownpourServer derives from it.
+    """
+
+    def __init__(self):
+        pass
+
+
+class Worker(object):
+    """
+        Empty base class for worker descs; DownpourWorker derives from it.
+    """
+
+    def __init__(self):
+        pass
+
+
+class DownpourServer(Server):
+    """
+        DownpourServer class is used to generate server program_desc
+        The constructor takes no arguments; the desc is kept in
+        self.server_, a pslib.ServerParameter().
+        Examples:
+            server = DownpourServer()
+    """
+
+    def __init__(self):
+        self.server_ = pslib.ServerParameter()
+        service_param = self.server_.downpour_server_param.service_param
+        service_param.start_server_port = 0
+        service_param.server_class = "DownpourBrpcPsServer"
+        service_param.client_class = "DownpourBrpcPsClient"
+        service_param.service_class = "DownpourPsService"
+        service_param.server_thread_num = 12
+
+    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
+                         slot_value_var):
+        """
+        Append a DownpourSparseTable entry to the server desc.
+        Args:
+            table_id(int): id of sparse params table
+            learning_rate(float): learning rate of the sparse sgd rule
+            slot_key_vars: slot key variables (not used by the server desc)
+            slot_value_var: slot embedding variables (not used here either)
+        Returns:
+            None
+        """
+        table = self.server_.downpour_server_param.downpour_table_param.add()
+        table.table_id = table_id
+        table.table_class = "DownpourSparseTable"
+        table.type = pslib.PS_SPARSE_TABLE
+        table.accessor.accessor_class = "DownpourFeatureValueAccessor"
+        table.accessor.sparse_sgd_param.learning_rate = learning_rate
+        table.accessor.sparse_sgd_param.initial_g2sum = 3
+        table.accessor.sparse_sgd_param.initial_range = 1e-4
+        table.accessor.sparse_sgd_param.weight_bounds.extend([-10, 10])
+
+        table.accessor.embedx_dim = 8
+        table.accessor.embedx_threshold = 5
+        table.accessor.fea_dim = 11
+        table.accessor.downpour_accessor_param.nonclk_coeff = 0.1
+        table.accessor.downpour_accessor_param.click_coeff = 2
+        table.accessor.downpour_accessor_param.base_threshold = 0.2
+        table.accessor.downpour_accessor_param.delta_threshold = 0.15
+        table.accessor.downpour_accessor_param.delta_keep_days = 31
+        table.accessor.downpour_accessor_param.show_click_decay_rate = 0.999
+        table.accessor.downpour_accessor_param.delete_threshold = 0.8
+
+    def add_dense_table(self, table_id, learning_rate, param_var, grad_var):
+        """
+        Append a DownpourDenseTable entry to the server desc.
+        Args:
+            table_id(int): id of dense params table
+            learning_rate(float): learning rate of the adam rule
+            param_var(list): dense params; names containing "embedding" are
+                skipped when summing fea_dim. grad_var(list) is unused here.
+        Returns:
+            None
+        """
+        table = self.server_.downpour_server_param.downpour_table_param.add()
+        table.table_id = table_id
+        table.table_class = "DownpourDenseTable"
+        table.type = pslib.PS_DENSE_TABLE
+        table.accessor.accessor_class = "DownpourDenseValueAccessor"
+        table.accessor.dense_sgd_param.name = "adam"
+        table.accessor.dense_sgd_param.adam.learning_rate = learning_rate
+        table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993
+        table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999
+        table.accessor.dense_sgd_param.adam.ada_epsilon = 1e-8
+        table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99
+        table.accessor.dense_sgd_param.naive.learning_rate = 0.0002
+        from functools import reduce  # reduce is a builtin only on Python 2
+        table.accessor.fea_dim = sum(
+            reduce(lambda x, y: x * y, param.shape, 1)
+            for param in param_var
+            if param.name.find("embedding") == -1)
+
+    def get_desc(self):
+        """
+        Return downpour server program_desc
+        """
+        return self.server_
+
+
+class DownpourWorker(Worker):
+    """
+        DownpourWorker class is used to generate worker program_desc
+        Args:
+            window (int): push params frequency (only stored as self.window)
+            (self.worker_ holds the pslib.DownpourTrainerParameter desc)
+        Examples:
+            worker = DownpourWorker(1)
+    """
+
+    def __init__(self, window):
+        self.window = window
+        self.worker_ = pslib.DownpourTrainerParameter()
+
+    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
+                         slot_value_vars):
+        """
+        Append a sparse table entry to the worker desc.
+        Args:
+            table_id(int): id of sparse params table
+            learning_rate(float): not used by the worker desc
+            slot_key_vars(list): variables whose names become slot_key
+            slot_value_vars(list): variables whose names become slot_value
+                and, with "@GRAD" appended, slot_gradient
+        Returns: None
+        """
+        table = self.worker_.sparse_table.add()
+        table.table_id = table_id
+        table.slot_key.extend([var.name for var in slot_key_vars])
+        table.slot_value.extend([var.name for var in slot_value_vars])
+        table.slot_gradient.extend(
+            [var.name + "@GRAD" for var in slot_value_vars])
+
+    def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars):
+        """
+        Append a dense table entry to the worker desc.
+        Args:
+            table_id(int): id of dense params table
+            learning_rate(float): not used by the worker desc
+            param_vars(list): dense params; names containing "embedding"
+                are filtered out. grad_vars(list) is filtered the same way.
+        Returns:
+            None
+        """
+        table = self.worker_.dense_table.add()
+        table.table_id = table_id
+        table.dense_variable_name.extend(
+            filter(lambda x: x.find("embedding") == -1,
+                   [p.name for p in param_vars]))
+        table.dense_gradient_variable_name.extend(
+            filter(lambda x: x.find("embedding") == -1,
+                   [g.name for g in grad_vars]))
+
+    def get_desc(self):
+        """
+        Return downpour worker program_desc
+        """
+        return self.worker_
diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3ce3ce6934d08eb06763fea071a83e460c6bf6c
--- /dev/null
+++ b/python/paddle/fluid/distributed/ps_instance.py
@@ -0,0 +1,148 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from .helper import MPIHelper
+
+
+class PaddlePSInstance(object):
+    """
+        PaddlePSInstance class is used to generate A instance of server or worker
+        Args:
+            server_worker_mode: 0 or 1, selects how ranks map to server/worker
+            proc_per_node: process per node
+        Examples:
+            instance = PaddlePSInstance(1, 2)
+    """
+
+    def __init__(self, server_worker_mode, proc_per_node):
+        self.dh = MPIHelper()
+        self._rankid = self.dh.get_rank()
+        self._server_worker_mode = server_worker_mode
+        self._proc_per_node = proc_per_node
+        self._nodes = self.dh.get_size()
+
+        self._ip = 0
+        self._worker_num = self._nodes * self._proc_per_node // 2
+        self._server_num = self._nodes * self._proc_per_node // 2
+        self._total_server_worker = self._worker_num + self._server_num
+        self._node_type = None  #IDLE=-1, WORKER=1, SERVER=0
+        self._set_nodetype()
+        self._comm = None
+        self._split_comm()
+
+    def _set_nodetype(self):
+        if self._server_worker_mode == 0:
+            if self._rankid < self._server_num:
+                self._node_type = 1
+            elif self._rankid < self._total_server_worker:
+                self._node_type = 0
+            else:
+                self._node_type = -1
+        elif self._server_worker_mode == 1:
+            if self._rankid < self._total_server_worker:
+                if 0 == self._rankid % self._proc_per_node % 2:
+                    self._node_type = 0
+                else:
+                    self._node_type = 1
+            else:
+                self._node_type = -1
+        else:
+            self._node_type = -1
+
+    def _split_comm(self):
+        # Servers and workers each join their own sub-communicator: the node
+        # type (0 or 1) is passed as the Split color. Idle nodes do not call
+        # Split and keep _comm as None.
+        if self.is_server() or self.is_worker():
+            self._comm = self.dh.comm.Split(self._node_type)
+
+    def get_worker_index(self):
+        """
+        Return worker index (mode 0: _set_nodetype puts workers on first ranks)
+        """
+        if self._server_worker_mode == 0:
+            return self._rankid
+        else:
+            return self._rankid // self._proc_per_node
+
+    def get_server_index(self):
+        """
+        Return server index (mode 0: server ranks start at _server_num)
+        """
+        if self._server_worker_mode == 0:
+            return self._rankid - self._server_num
+        else:
+            return self._rankid // self._proc_per_node
+
+    def is_worker(self):
+        """
+        Return instance is worker or not
+        """
+        return self._node_type == 1
+
+    def is_server(self):
+        """
+        Return instance is server or not
+        """
+        return self._node_type == 0
+
+    def is_first_worker(self):
+        """
+        Return instance is first worker or not
+        """
+        return self.is_worker() and 0 == self.get_worker_index()
+
+    def set_ip(self, ip):
+        """
+            set server ip
+        """
+        self._ip = ip
+
+    def gather_ips(self):
+        """
+        Return all servers and workers ip through mpi allgather
+        """
+        self._ips = self.dh.comm.allgather(self._ip)
+        return self._ips
+
+    def get_node_cnt(self):
+        """
+        Return node cnt
+        """
+        return self._nodes
+
+    def barrier_all(self):
+        """
+        barrier workers and servers
+        """
+        self.dh.comm.barrier()
+
+    def barrier_worker(self):
+        """
+        barrier workers
+        """
+        if self.is_worker():
+            self._comm.barrier()
+        pass
+
+    def finalize(self):
+        """
+        MPI finalize
+        """
+        self.dh.finalize()
+        pass
+
+
+if __name__ == "__main__":
+    instance = PaddlePSInstance(1, 2)  # (server_worker_mode, proc_per_node)
+    instance.barrier_all()
diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d226c4d593473681658fa3e7764d438a65b7116
--- /dev/null
+++ b/python/paddle/fluid/distributed/ps_pb2.py
@@ -0,0 +1,2296 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: ps.proto
+
+import sys
+_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode('latin1'))
+from google.protobuf.internal import enum_type_wrapper
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf import descriptor_pb2
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+    name='ps.proto',
+    package='paddle',
+    syntax='proto2',
+    serialized_pb=_b(
+        '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xce\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 
\x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 
\x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 
\x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01'
+    ))
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+_TABLETYPE = _descriptor.EnumDescriptor(
+    name='TableType',
+    full_name='paddle.TableType',
+    filename=None,
+    file=DESCRIPTOR,
+    values=[
+        _descriptor.EnumValueDescriptor(
+            name='PS_SPARSE_TABLE', index=0, number=0, options=None, type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_DENSE_TABLE', index=1, number=1, options=None, type=None),
+    ],
+    containing_type=None,
+    options=None,
+    serialized_start=3286,
+    serialized_end=3338, )
+_sym_db.RegisterEnumDescriptor(_TABLETYPE)
+
+TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE)
+_PSCMDID = _descriptor.EnumDescriptor(
+    name='PsCmdID',
+    full_name='paddle.PsCmdID',
+    filename=None,
+    file=DESCRIPTOR,
+    values=[
+        _descriptor.EnumValueDescriptor(
+            name='PS_PULL_DENSE_TABLE',
+            index=0,
+            number=0,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_PUSH_DENSE_TABLE',
+            index=1,
+            number=1,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_PULL_SPARSE_TABLE',
+            index=2,
+            number=2,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_PUSH_SPARSE_TABLE',
+            index=3,
+            number=3,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_SHRINK_TABLE', index=4, number=4, options=None, type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_SAVE_ONE_TABLE',
+            index=5,
+            number=5,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_SAVE_ALL_TABLE',
+            index=6,
+            number=6,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_LOAD_ONE_TABLE',
+            index=7,
+            number=7,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_LOAD_ALL_TABLE',
+            index=8,
+            number=8,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_CLEAR_ONE_TABLE',
+            index=9,
+            number=9,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_CLEAR_ALL_TABLE',
+            index=10,
+            number=10,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_PUSH_DENSE_PARAM',
+            index=11,
+            number=11,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_STOP_SERVER', index=12, number=12, options=None,
+            type=None),
+    ],
+    containing_type=None,
+    options=None,
+    serialized_start=3341,
+    serialized_end=3658, )
+_sym_db.RegisterEnumDescriptor(_PSCMDID)
+
+PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID)
+PS_SPARSE_TABLE = 0
+PS_DENSE_TABLE = 1
+PS_PULL_DENSE_TABLE = 0
+PS_PUSH_DENSE_TABLE = 1
+PS_PULL_SPARSE_TABLE = 2
+PS_PUSH_SPARSE_TABLE = 3
+PS_SHRINK_TABLE = 4
+PS_SAVE_ONE_TABLE = 5
+PS_SAVE_ALL_TABLE = 6
+PS_LOAD_ONE_TABLE = 7
+PS_LOAD_ALL_TABLE = 8
+PS_CLEAR_ONE_TABLE = 9
+PS_CLEAR_ALL_TABLE = 10
+PS_PUSH_DENSE_PARAM = 11
+PS_STOP_SERVER = 12
+
+_FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor(
+    name='FsApiType',
+    full_name='paddle.FsClientParameter.FsApiType',
+    filename=None,
+    file=DESCRIPTOR,
+    values=[
+        _descriptor.EnumValueDescriptor(
+            name='HDFS', index=0, number=0, options=None, type=None),
+        _descriptor.EnumValueDescriptor(
+            name='AFS', index=1, number=1, options=None, type=None),
+    ],
+    containing_type=None,
+    options=None,
+    serialized_start=3254,
+    serialized_end=3284, )
+_sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE)
+
+_PSPARAMETER = _descriptor.Descriptor(
+    name='PSParameter',
+    full_name='paddle.PSParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='worker_class',
+            full_name='paddle.PSParameter.worker_class',
+            index=0,
+            number=1,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='server_class',
+            full_name='paddle.PSParameter.server_class',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='instance_class',
+            full_name='paddle.PSParameter.instance_class',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='worker_param',
+            full_name='paddle.PSParameter.worker_param',
+            index=3,
+            number=101,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='server_param',
+            full_name='paddle.PSParameter.server_param',
+            index=4,
+            number=102,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='trainer_param',
+            full_name='paddle.PSParameter.trainer_param',
+            index=5,
+            number=301,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='fs_client_param',
+            full_name='paddle.PSParameter.fs_client_param',
+            index=6,
+            number=501,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=21,
+    serialized_end=307, )
+
+_WORKERPARAMETER = _descriptor.Descriptor(
+    name='WorkerParameter',
+    full_name='paddle.WorkerParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='downpour_worker_param',
+            full_name='paddle.WorkerParameter.downpour_worker_param',
+            index=0,
+            number=1,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=309,
+    serialized_end=390, )
+
+_SERVERPARAMETER = _descriptor.Descriptor(
+    name='ServerParameter',
+    full_name='paddle.ServerParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='downpour_server_param',
+            full_name='paddle.ServerParameter.downpour_server_param',
+            index=0,
+            number=1,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=392,
+    serialized_end=473, )
+
+_DOWNPOURWORKERPARAMETER = _descriptor.Descriptor(
+    name='DownpourWorkerParameter',
+    full_name='paddle.DownpourWorkerParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='downpour_table_param',
+            full_name='paddle.DownpourWorkerParameter.downpour_table_param',
+            index=0,
+            number=1,
+            type=11,
+            cpp_type=10,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=475,
+    serialized_end=554, )
+
+_DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor(
+    name='DownpourTrainerParameter',
+    full_name='paddle.DownpourTrainerParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='dense_table',
+            full_name='paddle.DownpourTrainerParameter.dense_table',
+            index=0,
+            number=1,
+            type=11,
+            cpp_type=10,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='sparse_table',
+            full_name='paddle.DownpourTrainerParameter.sparse_table',
+            index=1,
+            number=2,
+            type=11,
+            cpp_type=10,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='push_sparse_per_batch',
+            full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch',
+            index=2,
+            number=3,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='push_dense_per_batch',
+            full_name='paddle.DownpourTrainerParameter.push_dense_per_batch',
+            index=3,
+            number=4,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='skip_op',
+            full_name='paddle.DownpourTrainerParameter.skip_op',
+            index=4,
+            number=5,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=557,
+    serialized_end=763, )
+
+_DENSETABLEPARAMETER = _descriptor.Descriptor(
+    name='DenseTableParameter',
+    full_name='paddle.DenseTableParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='table_id',
+            full_name='paddle.DenseTableParameter.table_id',
+            index=0,
+            number=1,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='dense_variable_name',
+            full_name='paddle.DenseTableParameter.dense_variable_name',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='dense_gradient_variable_name',
+            full_name='paddle.DenseTableParameter.dense_gradient_variable_name',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='fea_dim',
+            full_name='paddle.DenseTableParameter.fea_dim',
+            index=3,
+            number=4,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=765,
+    serialized_end=888, )
+
+_SPARSETABLEPARAMETER = _descriptor.Descriptor(
+    name='SparseTableParameter',
+    full_name='paddle.SparseTableParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='table_id',
+            full_name='paddle.SparseTableParameter.table_id',
+            index=0,
+            number=1,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='feature_dim',
+            full_name='paddle.SparseTableParameter.feature_dim',
+            index=1,
+            number=2,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='slot_key',
+            full_name='paddle.SparseTableParameter.slot_key',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='slot_value',
+            full_name='paddle.SparseTableParameter.slot_value',
+            index=3,
+            number=4,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='slot_gradient',
+            full_name='paddle.SparseTableParameter.slot_gradient',
+            index=4,
+            number=5,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=890,
+    serialized_end=1012, )
+
+_DOWNPOURSERVERPARAMETER = _descriptor.Descriptor(
+    name='DownpourServerParameter',
+    full_name='paddle.DownpourServerParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='downpour_table_param',
+            full_name='paddle.DownpourServerParameter.downpour_table_param',
+            index=0,
+            number=1,
+            type=11,
+            cpp_type=10,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='service_param',
+            full_name='paddle.DownpourServerParameter.service_param',
+            index=1,
+            number=2,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1015,
+    serialized_end=1149, )
+
+_SERVERSERVICEPARAMETER = _descriptor.Descriptor(
+    name='ServerServiceParameter',
+    full_name='paddle.ServerServiceParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='server_class',
+            full_name='paddle.ServerServiceParameter.server_class',
+            index=0,
+            number=1,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=True,
+            default_value=_b("DownpourBrpcPsServer").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='client_class',
+            full_name='paddle.ServerServiceParameter.client_class',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=True,
+            default_value=_b("DownpourBrpcPsClient").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='service_class',
+            full_name='paddle.ServerServiceParameter.service_class',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=True,
+            default_value=_b("DownpourPsService").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='start_server_port',
+            full_name='paddle.ServerServiceParameter.start_server_port',
+            index=3,
+            number=4,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=True,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='server_thread_num',
+            full_name='paddle.ServerServiceParameter.server_thread_num',
+            index=4,
+            number=5,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=True,
+            default_value=12,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1152,
+    serialized_end=1367, )
+
+_TABLEPARAMETER = _descriptor.Descriptor(
+    name='TableParameter',
+    full_name='paddle.TableParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='table_id',
+            full_name='paddle.TableParameter.table_id',
+            index=0,
+            number=1,
+            type=4,
+            cpp_type=4,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='table_class',
+            full_name='paddle.TableParameter.table_class',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='shared_num',
+            full_name='paddle.TableParameter.shared_num',
+            index=2,
+            number=3,
+            type=4,
+            cpp_type=4,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='accessor',
+            full_name='paddle.TableParameter.accessor',
+            index=3,
+            number=4,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='type',
+            full_name='paddle.TableParameter.type',
+            index=4,
+            number=5,
+            type=14,
+            cpp_type=8,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='compress_in_save',
+            full_name='paddle.TableParameter.compress_in_save',
+            index=5,
+            number=6,
+            type=8,
+            cpp_type=7,
+            label=1,
+            has_default_value=True,
+            default_value=False,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1370,
+    serialized_end=1561, )
+
+_TABLEACCESSORPARAMETER = _descriptor.Descriptor(
+    name='TableAccessorParameter',
+    full_name='paddle.TableAccessorParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='accessor_class',
+            full_name='paddle.TableAccessorParameter.accessor_class',
+            index=0,
+            number=1,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='sparse_sgd_param',
+            full_name='paddle.TableAccessorParameter.sparse_sgd_param',
+            index=1,
+            number=2,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='dense_sgd_param',
+            full_name='paddle.TableAccessorParameter.dense_sgd_param',
+            index=2,
+            number=3,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='fea_dim',
+            full_name='paddle.TableAccessorParameter.fea_dim',
+            index=3,
+            number=4,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='embedx_dim',
+            full_name='paddle.TableAccessorParameter.embedx_dim',
+            index=4,
+            number=5,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='embedx_threshold',
+            full_name='paddle.TableAccessorParameter.embedx_threshold',
+            index=5,
+            number=6,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='downpour_accessor_param',
+            full_name='paddle.TableAccessorParameter.downpour_accessor_param',
+            index=6,
+            number=7,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='table_accessor_save_param',
+            full_name='paddle.TableAccessorParameter.table_accessor_save_param',
+            index=7,
+            number=8,
+            type=11,
+            cpp_type=10,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1564,
+    serialized_end=1933, )
+
+_DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
+    name='DownpourTableAccessorParameter',
+    full_name='paddle.DownpourTableAccessorParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='nonclk_coeff',
+            full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff',
+            index=0,
+            number=1,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='click_coeff',
+            full_name='paddle.DownpourTableAccessorParameter.click_coeff',
+            index=1,
+            number=2,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='base_threshold',
+            full_name='paddle.DownpourTableAccessorParameter.base_threshold',
+            index=2,
+            number=3,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='delta_threshold',
+            full_name='paddle.DownpourTableAccessorParameter.delta_threshold',
+            index=3,
+            number=4,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='delta_keep_days',
+            full_name='paddle.DownpourTableAccessorParameter.delta_keep_days',
+            index=4,
+            number=5,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='show_click_decay_rate',
+            full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate',
+            index=5,
+            number=6,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='delete_threshold',
+            full_name='paddle.DownpourTableAccessorParameter.delete_threshold',
+            index=6,
+            number=7,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1936,
+    serialized_end=2142, )
+
+_TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor(
+    name='TableAccessorSaveParameter',
+    full_name='paddle.TableAccessorSaveParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='param',
+            full_name='paddle.TableAccessorSaveParameter.param',
+            index=0,
+            number=1,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='converter',
+            full_name='paddle.TableAccessorSaveParameter.converter',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='deconverter',
+            full_name='paddle.TableAccessorSaveParameter.deconverter',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2144,
+    serialized_end=2227, )
+
+_PSREQUESTMESSAGE = _descriptor.Descriptor(
+    name='PsRequestMessage',
+    full_name='paddle.PsRequestMessage',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='cmd_id',
+            full_name='paddle.PsRequestMessage.cmd_id',
+            index=0,
+            number=1,
+            type=13,
+            cpp_type=3,
+            label=2,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='table_id',
+            full_name='paddle.PsRequestMessage.table_id',
+            index=1,
+            number=2,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='params',
+            full_name='paddle.PsRequestMessage.params',
+            index=2,
+            number=3,
+            type=12,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='client_id',
+            full_name='paddle.PsRequestMessage.client_id',
+            index=3,
+            number=4,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='data',
+            full_name='paddle.PsRequestMessage.data',
+            index=4,
+            number=5,
+            type=12,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b(""),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2229,
+    serialized_end=2330, )
+
+_SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
+    name='SparseSGDRuleParameter',
+    full_name='paddle.SparseSGDRuleParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='learning_rate',
+            full_name='paddle.SparseSGDRuleParameter.learning_rate',
+            index=0,
+            number=1,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='initial_g2sum',
+            full_name='paddle.SparseSGDRuleParameter.initial_g2sum',
+            index=1,
+            number=2,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='initial_range',
+            full_name='paddle.SparseSGDRuleParameter.initial_range',
+            index=2,
+            number=3,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=True,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='weight_bounds',
+            full_name='paddle.SparseSGDRuleParameter.weight_bounds',
+            index=3,
+            number=4,
+            type=2,
+            cpp_type=6,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2332,
+    serialized_end=2451, )
+
+_DENSESGDRULEPARAMETER = _descriptor.Descriptor(
+    name='DenseSGDRuleParameter',
+    full_name='paddle.DenseSGDRuleParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='name',
+            full_name='paddle.DenseSGDRuleParameter.name',
+            index=0,
+            number=1,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='adam',
+            full_name='paddle.DenseSGDRuleParameter.adam',
+            index=1,
+            number=2,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='naive',
+            full_name='paddle.DenseSGDRuleParameter.naive',
+            index=2,
+            number=3,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='summary',
+            full_name='paddle.DenseSGDRuleParameter.summary',
+            index=3,
+            number=4,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='moving_average',
+            full_name='paddle.DenseSGDRuleParameter.moving_average',
+            index=4,
+            number=5,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2454,
+    serialized_end=2679, )
+
+_ADAMSGDPARAMETER = _descriptor.Descriptor(
+    name='AdamSGDParameter',
+    full_name='paddle.AdamSGDParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='learning_rate',
+            full_name='paddle.AdamSGDParameter.learning_rate',
+            index=0,
+            number=1,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='avg_decay_rate',
+            full_name='paddle.AdamSGDParameter.avg_decay_rate',
+            index=1,
+            number=2,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='ada_decay_rate',
+            full_name='paddle.AdamSGDParameter.ada_decay_rate',
+            index=2,
+            number=3,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='ada_epsilon',
+            full_name='paddle.AdamSGDParameter.ada_epsilon',
+            index=3,
+            number=4,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='mom_decay_rate',
+            full_name='paddle.AdamSGDParameter.mom_decay_rate',
+            index=4,
+            number=5,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2682,
+    serialized_end=2816, )
+
+_NAIVESGDPARAMETER = _descriptor.Descriptor(
+    name='NaiveSGDParameter',
+    full_name='paddle.NaiveSGDParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='learning_rate',
+            full_name='paddle.NaiveSGDParameter.learning_rate',
+            index=0,
+            number=1,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='avg_decay_rate',
+            full_name='paddle.NaiveSGDParameter.avg_decay_rate',
+            index=1,
+            number=2,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2818,
+    serialized_end=2884, )
+
+_SUMMARYSGDPARAMETER = _descriptor.Descriptor(
+    name='SummarySGDParameter',
+    full_name='paddle.SummarySGDParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='summary_decay_rate',
+            full_name='paddle.SummarySGDParameter.summary_decay_rate',
+            index=0,
+            number=1,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=True,
+            default_value=float(0.999999),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2886,
+    serialized_end=2945, )
+
+_MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor(
+    name='MovingAverageRuleParameter',
+    full_name='paddle.MovingAverageRuleParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='momentum',
+            full_name='paddle.MovingAverageRuleParameter.momentum',
+            index=0,
+            number=1,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2947,
+    serialized_end=2993, )
+
+_PSRESPONSEMESSAGE = _descriptor.Descriptor(
+    name='PsResponseMessage',
+    full_name='paddle.PsResponseMessage',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='err_code',
+            full_name='paddle.PsResponseMessage.err_code',
+            index=0,
+            number=1,
+            type=5,
+            cpp_type=1,
+            label=2,
+            has_default_value=True,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='err_msg',
+            full_name='paddle.PsResponseMessage.err_msg',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=2,
+            has_default_value=True,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='data',
+            full_name='paddle.PsResponseMessage.data',
+            index=2,
+            number=3,
+            type=12,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b(""),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2995,
+    serialized_end=3068, )
+
+_FSCLIENTPARAMETER = _descriptor.Descriptor(
+    name='FsClientParameter',
+    full_name='paddle.FsClientParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='fs_type',
+            full_name='paddle.FsClientParameter.fs_type',
+            index=0,
+            number=1,
+            type=14,
+            cpp_type=8,
+            label=1,
+            has_default_value=True,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='uri',
+            full_name='paddle.FsClientParameter.uri',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='user',
+            full_name='paddle.FsClientParameter.user',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='passwd',
+            full_name='paddle.FsClientParameter.passwd',
+            index=3,
+            number=4,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='buffer_size',
+            full_name='paddle.FsClientParameter.buffer_size',
+            index=4,
+            number=5,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='hadoop_bin',
+            full_name='paddle.FsClientParameter.hadoop_bin',
+            index=5,
+            number=51,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='afs_conf',
+            full_name='paddle.FsClientParameter.afs_conf',
+            index=6,
+            number=101,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[_FSCLIENTPARAMETER_FSAPITYPE, ],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=3071,
+    serialized_end=3284, )
+
+_PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER
+_PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER
+_PSPARAMETER.fields_by_name[
+    'trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER
+_PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER
+_WORKERPARAMETER.fields_by_name[
+    'downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER
+_SERVERPARAMETER.fields_by_name[
+    'downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER
+_DOWNPOURWORKERPARAMETER.fields_by_name[
+    'downpour_table_param'].message_type = _TABLEPARAMETER
+_DOWNPOURTRAINERPARAMETER.fields_by_name[
+    'dense_table'].message_type = _DENSETABLEPARAMETER
+_DOWNPOURTRAINERPARAMETER.fields_by_name[
+    'sparse_table'].message_type = _SPARSETABLEPARAMETER
+_DOWNPOURSERVERPARAMETER.fields_by_name[
+    'downpour_table_param'].message_type = _TABLEPARAMETER
+_DOWNPOURSERVERPARAMETER.fields_by_name[
+    'service_param'].message_type = _SERVERSERVICEPARAMETER
+_TABLEPARAMETER.fields_by_name[
+    'accessor'].message_type = _TABLEACCESSORPARAMETER
+_TABLEPARAMETER.fields_by_name['type'].enum_type = _TABLETYPE
+_TABLEACCESSORPARAMETER.fields_by_name[
+    'sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER
+_TABLEACCESSORPARAMETER.fields_by_name[
+    'dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER
+_TABLEACCESSORPARAMETER.fields_by_name[
+    'downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER
+_TABLEACCESSORPARAMETER.fields_by_name[
+    'table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER
+_DENSESGDRULEPARAMETER.fields_by_name['adam'].message_type = _ADAMSGDPARAMETER
+_DENSESGDRULEPARAMETER.fields_by_name['naive'].message_type = _NAIVESGDPARAMETER
+_DENSESGDRULEPARAMETER.fields_by_name[
+    'summary'].message_type = _SUMMARYSGDPARAMETER
+_DENSESGDRULEPARAMETER.fields_by_name[
+    'moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER
+_FSCLIENTPARAMETER.fields_by_name[
+    'fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE
+_FSCLIENTPARAMETER_FSAPITYPE.containing_type = _FSCLIENTPARAMETER
+DESCRIPTOR.message_types_by_name['PSParameter'] = _PSPARAMETER
+DESCRIPTOR.message_types_by_name['WorkerParameter'] = _WORKERPARAMETER
+DESCRIPTOR.message_types_by_name['ServerParameter'] = _SERVERPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER
+DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER
+DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'ServerServiceParameter'] = _SERVERSERVICEPARAMETER
+DESCRIPTOR.message_types_by_name['TableParameter'] = _TABLEPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'TableAccessorParameter'] = _TABLEACCESSORPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER
+DESCRIPTOR.message_types_by_name['PsRequestMessage'] = _PSREQUESTMESSAGE
+DESCRIPTOR.message_types_by_name[
+    'SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER
+DESCRIPTOR.message_types_by_name['AdamSGDParameter'] = _ADAMSGDPARAMETER
+DESCRIPTOR.message_types_by_name['NaiveSGDParameter'] = _NAIVESGDPARAMETER
+DESCRIPTOR.message_types_by_name['SummarySGDParameter'] = _SUMMARYSGDPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER
+DESCRIPTOR.message_types_by_name['PsResponseMessage'] = _PSRESPONSEMESSAGE
+DESCRIPTOR.message_types_by_name['FsClientParameter'] = _FSCLIENTPARAMETER
+DESCRIPTOR.enum_types_by_name['TableType'] = _TABLETYPE
+DESCRIPTOR.enum_types_by_name['PsCmdID'] = _PSCMDID
+
+PSParameter = _reflection.GeneratedProtocolMessageType(
+    'PSParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_PSPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.PSParameter)
+    ))
+_sym_db.RegisterMessage(PSParameter)
+
+WorkerParameter = _reflection.GeneratedProtocolMessageType(
+    'WorkerParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_WORKERPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.WorkerParameter)
+    ))
+_sym_db.RegisterMessage(WorkerParameter)
+
+ServerParameter = _reflection.GeneratedProtocolMessageType(
+    'ServerParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_SERVERPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.ServerParameter)
+    ))
+_sym_db.RegisterMessage(ServerParameter)
+
+DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType(
+    'DownpourWorkerParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DOWNPOURWORKERPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter)
+    ))
+_sym_db.RegisterMessage(DownpourWorkerParameter)
+
+DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType(
+    'DownpourTrainerParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DOWNPOURTRAINERPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter)
+    ))
+_sym_db.RegisterMessage(DownpourTrainerParameter)
+
+DenseTableParameter = _reflection.GeneratedProtocolMessageType(
+    'DenseTableParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DENSETABLEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter)
+    ))
+_sym_db.RegisterMessage(DenseTableParameter)
+
+SparseTableParameter = _reflection.GeneratedProtocolMessageType(
+    'SparseTableParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_SPARSETABLEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter)
+    ))
+_sym_db.RegisterMessage(SparseTableParameter)
+
+DownpourServerParameter = _reflection.GeneratedProtocolMessageType(
+    'DownpourServerParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DOWNPOURSERVERPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter)
+    ))
+_sym_db.RegisterMessage(DownpourServerParameter)
+
+ServerServiceParameter = _reflection.GeneratedProtocolMessageType(
+    'ServerServiceParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_SERVERSERVICEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter)
+    ))
+_sym_db.RegisterMessage(ServerServiceParameter)
+
+TableParameter = _reflection.GeneratedProtocolMessageType(
+    'TableParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_TABLEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.TableParameter)
+    ))
+_sym_db.RegisterMessage(TableParameter)
+
+TableAccessorParameter = _reflection.GeneratedProtocolMessageType(
+    'TableAccessorParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_TABLEACCESSORPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter)
+    ))
+_sym_db.RegisterMessage(TableAccessorParameter)
+
+DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType(
+    'DownpourTableAccessorParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DOWNPOURTABLEACCESSORPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter)
+    ))
+_sym_db.RegisterMessage(DownpourTableAccessorParameter)
+
+TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType(
+    'TableAccessorSaveParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_TABLEACCESSORSAVEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter)
+    ))
+_sym_db.RegisterMessage(TableAccessorSaveParameter)
+
+PsRequestMessage = _reflection.GeneratedProtocolMessageType(
+    'PsRequestMessage',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_PSREQUESTMESSAGE,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage)
+    ))
+_sym_db.RegisterMessage(PsRequestMessage)
+
+SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType(
+    'SparseSGDRuleParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_SPARSESGDRULEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter)
+    ))
+_sym_db.RegisterMessage(SparseSGDRuleParameter)
+
+DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType(
+    'DenseSGDRuleParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DENSESGDRULEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter)
+    ))
+_sym_db.RegisterMessage(DenseSGDRuleParameter)
+
+AdamSGDParameter = _reflection.GeneratedProtocolMessageType(
+    'AdamSGDParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_ADAMSGDPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter)
+    ))
+_sym_db.RegisterMessage(AdamSGDParameter)
+
+NaiveSGDParameter = _reflection.GeneratedProtocolMessageType(
+    'NaiveSGDParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_NAIVESGDPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter)
+    ))
+_sym_db.RegisterMessage(NaiveSGDParameter)
+
+SummarySGDParameter = _reflection.GeneratedProtocolMessageType(
+    'SummarySGDParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_SUMMARYSGDPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter)
+    ))
+_sym_db.RegisterMessage(SummarySGDParameter)
+
+MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType(
+    'MovingAverageRuleParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_MOVINGAVERAGERULEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter)
+    ))
+_sym_db.RegisterMessage(MovingAverageRuleParameter)
+
+PsResponseMessage = _reflection.GeneratedProtocolMessageType(
+    'PsResponseMessage',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_PSRESPONSEMESSAGE,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage)
+    ))
+_sym_db.RegisterMessage(PsResponseMessage)
+
+FsClientParameter = _reflection.GeneratedProtocolMessageType(
+    'FsClientParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_FSCLIENTPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.FsClientParameter)
+    ))
+_sym_db.RegisterMessage(FsClientParameter)
+
+DESCRIPTOR.has_options = True
+DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(),
+                                                _b('\200\001\001'))
+# @@protoc_insertion_point(module_scope)
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 9e6cd1a0ab54d9fd2764b787416e4f5c86755a68..4d8311a0d3ada78e4f6cc54f8990e2a2e2cadc4d 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -176,6 +176,7 @@ __all__ = [
     'get_tensor_from_selected_rows',
     'lstm',
     'psroi_pool',
+    'huber_loss',
 ]
 
 kIgnoreIndex = -100
@@ -497,7 +498,7 @@ def lstm(input,
     If Device is GPU, This op will use cudnn LSTM implementation
 
     A four-gate Long Short-Term Memory network with no peephole connections.
-    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, 
+    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
     the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
 
     $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
@@ -524,19 +525,19 @@ def lstm(input,
     - $\tilde{c_t}$ is also called candidate hidden state,
       which is computed based on the current input and the previous hidden state.
 
-    Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, 
+    Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
     X represensts a matrix multiplication
 
 
     Args:
         input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
-        init_h(Variable): The initial hidden state of the LSTM                       
+        init_h(Variable): The initial hidden state of the LSTM
                        This is a tensor with shape ( num_layers x batch_size x hidden_size)
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
         init_c(Variable): The initial cell state of the LSTM.
                        This is a tensor with shape ( num_layers x batch_size x hidden_size )
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len 
+        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT be greater than max_len
         hidden_size (int): hidden size of the LSTM
         num_layers (int): total layers number of the LSTM
         dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
@@ -555,10 +556,10 @@ def lstm(input,
                          if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
         last_h(Tensor): the hidden state of the last step of LSTM
                         shape is ( num_layers x batch_size x hidden_size )
-                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
+                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
         last_c(Tensor): the cell state of the last step of LSTM
                         shape is ( num_layers x batch_size x hidden_size )
-                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
+                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
 
 
     Examples:
@@ -4658,7 +4659,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                       [0.5, 0.1, 0.3, 0.1]]
 
         input.lod = [[4, 4]]
-      
+
         Computation:
 
         step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
@@ -4691,7 +4692,7 @@ def ctc_greedy_decoder(input, blank, name=None):
     Returns:
         Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1].
                   'Lp' is the sum if all output sequences' length. If all the sequences
-                  in result were empty, the result LoDTensor will be [-1] with 
+                  in result were empty, the result LoDTensor will be [-1] with
                   LoD [[]] and dims [1, 1].
 
     Examples:
@@ -5045,7 +5046,7 @@ def hsigmoid(input,
     """
     The hierarchical sigmoid operator is used to accelerate the training
     process of language model. This operator organizes the classes into a
-    complete binary tree, or you can use is_custom to pass your own tree to 
+    complete binary tree, or you can use is_custom to pass your own tree to
     implement hierarchical. Each leaf node represents a class(a word) and each
     internal node acts as a binary classifier. For each word there's a unique
     path from root to it's leaf node, hsigmoid calculate the cost for each
@@ -5061,7 +5062,7 @@ def hsigmoid(input,
         2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
         3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
          means label of each binary classification, using 1 indicate true, 0 indicate false.
-        4. now, each word should has its path and code along the path, you can pass a batch of path and code 
+        4. now, each word should have its path and code along the path, you can pass a batch of path and code
         related to the same batch of inputs.
 
 
@@ -5071,8 +5072,8 @@ def hsigmoid(input,
             and :math:`D` is the feature size.
         label (Variable): The tensor variable contains labels of training data.
             It's a tensor with shape is :math:`[N \\times 1]`.
-        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, 
-            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num 
+        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
+            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
             which indicates the num of classes using by binary classify.
         param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
              of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
@@ -5085,15 +5086,15 @@ def hsigmoid(input,
              is not set, the bias is initialized zero. Default: None.
         name (str|None): A name for this layer(optional). If set None, the layer
              will be named automatically. Default: None.
-        path_table: (Variable|None) this variable can store each batch of samples' path to root, 
+        path_table: (Variable|None) this variable can store each batch of samples' path to root,
             it should be in leaf -> root order
-            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like 
-            structure and each element in this array is indexes in parent nodes' Weight Matrix. 
-        path_code:  (Variable|None) this variable can store each batch of samples' code, 
+            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
+            structure and each element in this array is indexes in parent nodes' Weight Matrix.
+        path_code:  (Variable|None) this variable can store each batch of samples' code,
             each code consist with every code of parent nodes. it should be in leaf -> root order
-        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is 
+        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if custom is
              set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
-        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient 
+        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
              of W and input will be sparse.
 
     Returns:
@@ -9377,3 +9378,51 @@ def psroi_pool(input,
             'pooled_width': pooled_width
         })
     return out
+
+
+def huber_loss(input, label, delta):
+    """
+    Huber loss is a loss function used in robust regression. It measures how
+    well input fits label and, unlike MSE loss, is less sensitive to outliers.
+
+    When the absolute difference between input and label is larger than delta:
+
+    .. math::
+
+        huber\_loss = delta * |label - input| - 0.5 * delta * delta
+
+    When the absolute difference is less than or equal to delta:
+
+    .. math::
+
+        huber\_loss = 0.5 * (label - input) * (label - input)
+
+    Args:
+        input (Variable): This input is a probability computed by the previous operator.
+                          The first dimension is batch size, and the last dimension is 1.
+        label (Variable): The ground truth whose first dimension is batch size
+                          and last dimension is 1.
+        delta (float): The parameter of huber loss, which controls
+                       the range of outliers
+
+    Returns:
+        huber\_loss (Variable): The huber loss with shape [batch_size, 1].
+
+    Examples:
+        .. code-block:: python
+
+            predictions = fluid.layers.softmax(x)
+            loss = fluid.layers.huber_loss(input=predictions, label=label, delta=1.0)
+    """
+    helper = LayerHelper('huber_loss', **locals())
+    residual = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
+    helper.append_op(
+        type='huber_loss',
+        inputs={'X': input,
+                'Y': label},
+        outputs={'Out': out,
+                 'Residual': residual},
+        attrs={'delta': delta})
+    return out
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 86f861674c26fe61e624103c2a0d70f816a1aebc..e2a9fc183ea9206efd892b23844081cb9d2fb3d3 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -39,6 +39,7 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   seed=None,
                                   use_parallel_executor=True,
                                   use_reduce=False,
+                                  use_ir_memory_optimize=False,
                                   fuse_elewise_add_act_ops=False,
                                   optimizer=fluid.optimizer.Adam,
                                   use_fast_executor=False,
@@ -82,6 +83,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
                 if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
             build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
+            build_strategy.memory_optimize = use_ir_memory_optimize
             build_strategy.enable_sequential_execution = enable_sequential_execution
             if use_cuda and core.is_compiled_with_cuda():
                 build_strategy.remove_unnecessary_lock = True
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
index e91cfe0b45ab7e4e56fccf8d49eb381fbbd199d1..89476ee641f1dd295a3caca89ac41038cad317f2 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -39,6 +39,7 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
     cost = network(data, label, len(word_dict))
+    cost.persistable = True
     optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
     optimizer.minimize(cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ca65c5d3b689612f6624a7e0e16c4dabbae1738
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from parallel_executor_test_base import TestParallelExecutorBase
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+import paddle
+import paddle.dataset.mnist as mnist
+import unittest
+import os
+
+MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
+
+
+def _feed_data_helper(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'])
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+    return img, label
+
+
+def simple_fc_net(use_feed):
+    x, y = _feed_data_helper(use_feed)
+    hidden_layer = 4
+    for _ in range(hidden_layer):
+        x = fluid.layers.fc(input=x, size=20, act='relu')
+    y_predict = fluid.layers.fc(input=x, size=10, act='softmax')
+    cost = fluid.layers.cross_entropy(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+    return avg_cost
+
+
+def fc_with_inplace_net(use_feed):
+    x, y = _feed_data_helper(use_feed)
+    fc = fluid.layers.fc(input=x, size=20, act='relu')
+    fc = fluid.layers.fc(input=fc, size=10, act='relu')
+    reshape = fluid.layers.reshape(x=fc, shape=[-1, 2, 5])
+    reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2])
+    y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax')
+    cost = fluid.layers.cross_entropy(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+    return avg_cost
+
+
+class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                MNIST_RECORDIO_FILE, reader, feeder)
+
+    def _dummy_data(self):
+        np.random.seed(5)
+        img = np.random.random(size=[32, 784]).astype(np.float32)
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    def _compare_ir_and_python_memory_optimize(self, model, use_cuda):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        img, label = self._dummy_data()
+        first_loss0, last_loss0 = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            memory_opt=False,
+            use_ir_memory_optimize=False)
+        first_loss1, last_loss1 = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            memory_opt=False,
+            use_ir_memory_optimize=True)
+        for loss in zip(first_loss0, first_loss1):
+            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
+        for loss in zip(last_loss0, last_loss1):
+            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
+
+    def test_simple_fc_net(self):
+        self._compare_ir_and_python_memory_optimize(simple_fc_net, False)
+        self._compare_ir_and_python_memory_optimize(simple_fc_net, True)
+
+    def test_fc_with_reshape_net(self):
+        self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, False)
+        self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 95aafec05361a8b66b849268c7a738bb2ee5da86..7b530ba617498c79edc52ef177990410daf6b2a4 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -43,6 +43,7 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
                   ("conditional_block", "conditional_block_grad")]
 
 PRINT_LOG = False
+FLAGS_memory_optimize = ""  # var name whose cache matching gets printed
 
 
 class OrderedSet(MutableSet):
@@ -121,6 +122,7 @@ class ControlFlowGraph(object):
         self._defs = defaultdict(OrderedSet)
         self._live_in = defaultdict(OrderedSet)
         self._live_out = defaultdict(OrderedSet)
+
         self._skip_opt = skip_opt
         self.pool = []
 
@@ -144,7 +146,6 @@ class ControlFlowGraph(object):
         for i in range(self.op_size):
             self._uses[i].update(self._ops[i].input_arg_names())
             self._defs[i].update(self._ops[i].output_arg_names())
-            self._live_in[i] = self._uses[i]
 
     def _update_graph(self, old_name, new_name, begin_idx=0):
         for i in range(begin_idx, self.op_size):
@@ -177,20 +178,52 @@ class ControlFlowGraph(object):
                     worklist.append(d)
 
     def _fill_pool(self, i, is_forward):
+        """Insert reusable variables found at op `i` into `self.pool`.
+
+        Candidates are the names in the first value returned by
+        `_get_diff(live_in[i], live_out[i])` that pass
+        `_check_var_validity` and are written by some op in `self._ops`.
+        Each new `(name, shape)` entry goes in front of the first pool entry
+        that sorts after it; see `order_key` for the ordering.
+
+        Args:
+            i: index into `self._ops` of the op being processed.
+            is_forward: forwarded to `_check_var_validity` / `_find_var`.
+        """
+
+        def order_key(entry):
+            # Entries whose leading dim is -1 sort ahead of all others;
+            # inside each group the absolute element count ascends.
+            shape = entry[1]
+            group = 0 if len(shape) > 0 and shape[0] == -1 else 1
+            # Initial value 1 keeps reduce() from raising on an empty shape.
+            return group, abs(reduce(lambda a, b: a * b, shape, 1))
+
         block_desc = self._ops[i].block()
         in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
         # NOTE: must sort the in_diff set for cases that get different cache var.
         # FIXME(typhoonzero): maybe use a "sorted set" is better than this.
         can_optimize = [
-            x for x in in_diff
+            x for x in sorted(in_diff)
             if self._check_var_validity(block_desc, x, is_forward)
         ]
         if can_optimize:
+            # Gather every op output once rather than once per candidate.
+            produced = set()
+            for op in self._ops:
+                produced.update(op.output_arg_names())
             for var_name in can_optimize:
                 cache = (var_name, self._find_var(block_desc, var_name,
                                                   is_forward).shape())
-                if cache not in self.pool:
-                    self.pool.append(cache)
+                if cache in self.pool or var_name not in produced:
+                    continue
+                cache_key = order_key(cache)
+                # `pos` (not `i`) so the op-index parameter is not clobbered.
+                pos = 0
+                while pos < len(self.pool) and \
+                        order_key(self.pool[pos]) <= cache_key:
+                    pos += 1
+                self.pool.insert(pos, cache)
 
     def _get_diff(self, a, b):
         u = a & b
@@ -229,7 +262,7 @@ class ControlFlowGraph(object):
     def _update_skip_opt_set(self):
         for i in range(self.op_size):
             op = self._ops[i]
-            if op.type() == "fill_constant" and op.attr("force_cpu") == True:
+            if op.has_attr("force_cpu") and op.attr("force_cpu") == True:
                 self._skip_opt.update(op.output_arg_names())
 
     def release_memory(self, skip_opt_set=None):
@@ -281,6 +314,7 @@ class ControlFlowGraph(object):
         # update skip set to meet users' demand
         if skip_opt_set:
             self._skip_opt.update(skip_opt_set)
+        counter = 0
         for i in range(self.op_size):
             op = self._ops[i]
             if op.type() in SUB_BLOCK_OPS:
@@ -301,6 +335,9 @@ class ControlFlowGraph(object):
                     # If x is both in uses and defs, it can not be optimized!
                     if x in self._uses[i]:
                         continue
+                    if x == FLAGS_memory_optimize:
+                        print("start match var ", x, " of op ", op.type())
+                        print(self.pool)
                     for index, cache_pair in enumerate(self.pool):
                         cache_var = cache_pair[0]
                         cache_shape = cache_pair[1]
@@ -323,15 +360,13 @@ class ControlFlowGraph(object):
                         if not compare_shape(x_shape, cache_shape, level):
                             continue
                         # TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
-                        if x_dtype != cache_dtype:
-                            continue
-
                         if PRINT_LOG:
-                            print(("Hit Cache !!!! cache pool index "
-                                   "is %d, var name is %s, "
-                                   "cached var name is %s, "
-                                   "var shape is %s ") % (index, x, cache_var,
-                                                          str(cache_shape)))
+                            print(
+                                ("!!! %d,  %s => %s, cache idx %d, pool size %d"
+                                 % (counter, x + str(x_shape),
+                                    cache_var + str(cache_shape), index,
+                                    len(self.pool))))
+                            counter += 1
                         self.pool.pop(index)
                         # Rename the var to the cache var already with
                         # memory allocated in order to reuse the memory.
@@ -484,8 +519,11 @@ def memory_optimize(input_program,
 
     if level != 0 and level != 1:
         raise ValueError("only support opt_level 0 or 1.")
-    if skip_opt_set is not None and not isinstance(skip_opt_set, set):
-        raise ValueError("only support skip_opt_set as set.")
+    if skip_opt_set is not None:
+        # Lists are accepted as well and normalized to a set.
+        if not isinstance(skip_opt_set, (set, list)):
+            raise ValueError("only support skip_opt_set as set or list.")
+        skip_opt_set = set(skip_opt_set)
     global PRINT_LOG
     PRINT_LOG = print_log
     if skip_grads:
diff --git a/python/setup.py.in b/python/setup.py.in
index 0eb69cdb5c7d140527dba7a648728750bfb404f7..cf8f28bd250d82c18b296a4d01c3a9856d801b8c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -104,8 +104,10 @@ packages=['paddle',
           'paddle.fluid.imperative',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
+          'paddle.fluid.distributed',
           'paddle.fluid.layers',
           'paddle.fluid.contrib',
+          'paddle.fluid.contrib.utils',
           'paddle.fluid.contrib.decoder',
           'paddle.fluid.contrib.quantize',
           'paddle.fluid.transpiler',
@@ -241,5 +243,6 @@ setup(name='${PACKAGE_NAME}',
       ext_modules=ext_modules,
       package_data=package_data,
       package_dir=package_dir,
-      scripts=paddle_bins
+      scripts=paddle_bins,
+      distclass=BinaryDistribution
 )