Merge remote-tracking branch 'upstream/develop' into windows/fixgpuissue

92da467c · peizhilin · d54133ea · fd854183 · 92da467c · 92da467c
169 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,7 @@ option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
 option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
 option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
 option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler and gperftools"        OFF)
+option(WITH_JEMALLOC    "Compile PaddlePaddle with jemalloc"            OFF)
 option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
@@ -261,6 +262,12 @@ if (WITH_PROFILER)
    add_definitions(-DWITH_GPERFTOOLS)
 endif()

+if (WITH_JEMALLOC)
+    find_package(JeMalloc REQUIRED)
+    include_directories(${JEMALLOC_INCLUDE_DIR})
+    add_definitions(-DWITH_JEMALLOC)
+endif()
+
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(ccache)             # set ccache for compilation
@@ -290,7 +297,7 @@ if(WITH_PSLIB)
    list(APPEND EXTERNAL_LIBS pslib_brpc)
    list(APPEND EXTERNAL_LIBS libmct)
 endif(WITH_PSLIB)
-    
+
 if(WITH_AMD_GPU)
    find_package(HIP)
    include(hip)

--- a/cmake/FindJeMalloc.cmake
+++ b/cmake/FindJeMalloc.cmake
+# - Find JeMalloc library
+# Find the native JeMalloc includes and library
+#
+# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc.
+# JEMALLOC_LIBRARIES - List of libraries when using jemalloc.
+# JEMALLOC_FOUND - True if jemalloc found.
+
+find_path(JEMALLOC_INCLUDE_DIR
+  NAMES jemalloc/jemalloc.h
+  HINTS ${JEMALLOC_ROOT_DIR}/include)
+
+find_library(JEMALLOC_LIBRARIES
+  NAMES jemalloc
+  HINTS ${JEMALLOC_ROOT_DIR}/lib)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR)
+
+mark_as_advanced(
+  JEMALLOC_LIBRARIES
+  JEMALLOC_INCLUDE_DIR)
+
+if (JEMALLOC_FOUND)
+  add_library(jemalloc::jemalloc UNKNOWN IMPORTED)
+  set_target_properties(jemalloc::jemalloc PROPERTIES
+    IMPORTED_LOCATION ${JEMALLOC_LIBRARIES}
+    INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}")
+endif()
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -134,6 +134,7 @@ if(WITH_GPU)
            message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
            set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
        endif()
+        add_definitions(-DWITH_ANAKIN)
    endif()
    if(WITH_ANAKIN)
        # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR

--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -5,6 +5,8 @@ endif()
 set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
 set(paddle_known_gpu_archs7 "30 35 50 52")
 set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
+set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
+set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")

 ######################################################################################
 # A function for automatic detection of GPUs installed  (if autodetection is enabled)
@@ -59,7 +61,7 @@ endfunction()
 #   select_nvcc_arch_flags(out_variable)
 function(select_nvcc_arch_flags out_variable)
  # List of arch names
-  set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
+  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
  set(archs_name_default "All")
  if(NOT CMAKE_CROSSCOMPILING)
    list(APPEND archs_names "Auto")
@@ -93,6 +95,8 @@ function(select_nvcc_arch_flags out_variable)
    set(cuda_arch_bin "60 61")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
    set(cuda_arch_bin "70")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
+    set(cuda_arch_bin "75")
  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
    set(cuda_arch_bin ${paddle_known_gpu_archs})
  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@@ -153,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
  # warning for now.
  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
  add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
+elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"90\"")
+elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
 endif()

 include_directories(${CUDA_INCLUDE_DIRS})

--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -23,11 +23,8 @@ set(BOOST_PROJECT       "extern_boost")
 # checked that the devtools package of CentOS 6 installs boost 1.41.0.
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
-if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
-    message(STATUS "use pre defined download url")
-    set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
-    set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
-endif()
+set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
+set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)

 MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")


--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -55,7 +55,7 @@ ExternalProject_Add(
    ${MKLDNN_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
-    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
+    GIT_REPOSITORY      "https://github.com/intel/mkl-dnn.git"
    GIT_TAG             "830a10059a018cd2634d94195140cf2d8790a75a"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""

--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -16,6 +16,12 @@ IF(NOT ${WITH_MKLML})
  return()
 ENDIF(NOT ${WITH_MKLML})

+IF(APPLE)
+    MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
+    SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
+    return()
+ENDIF()
+
 INCLUDE(ExternalProject)
 SET(MKLML_DST_DIR       "mklml")
 SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
@@ -23,32 +29,24 @@ SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
 SET(MKLML_ROOT          ${MKLML_INSTALL_DIR})
 SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
 SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
-if(WIN32)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
+
+SET(TIME_VERSION "2019.0.1.20181227")
+IF(WIN32)
+    SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
+    SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/mklml.lib)
    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5md.lib)
    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/mklml.dll)
    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5md.dll)
-else()
+ELSE()  
+    SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
+    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml_intel.so)
    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.so)
    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/libmklml_intel.so)
    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5.so)
-endif()
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
-
-IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
-    MESSAGE(STATUS "use pre defined download url")
-    if(WIN32)
-        SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
-        SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
-    elseif(APPLE)
-        SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE)
-        SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-    else()
-        SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE)
-        SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-    ENDIF()
-endif()
+ENDIF()

 SET(MKLML_PROJECT       "extern_mklml")
 MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")

--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@@ -37,14 +37,18 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)

 SET(NGRAPH_PROJECT         "extern_ngraph")
-SET(NGRAPH_GIT_TAG         "v0.10.1")
+SET(NGRAPH_GIT_TAG         "20bd8bbc79ae3a81c57313846a2be7313e5d1dab")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
 SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
 SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
 SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
-SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    SET(NGRAPH_TBB_LIB_NAME    libtbb_debug.so.2)
+else()
+    SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
+endif()
 SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
 SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
 SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
@@ -66,16 +70,7 @@ ExternalProject_Add(
    CMAKE_ARGS          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
    CMAKE_ARGS          -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
    CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
-)
-
-# Workaround for nGraph expecting mklml to be in mkldnn install directory.
-ExternalProject_Add_Step(
-    ${NGRAPH_PROJECT}
-    PrepareMKL
-    COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so
-    COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so
-    DEPENDEES download
-    DEPENDERS configure
+    CMAKE_ARGS          -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
 )

 add_dependencies(ngraph ${NGRAPH_PROJECT})

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -115,6 +115,10 @@ function(common_link TARGET_NAME)
  if (WITH_PROFILER)
    target_link_libraries(${TARGET_NAME} gperftools::profiler)
  endif()
+
+  if (WITH_JEMALLOC)
+    target_link_libraries(${TARGET_NAME} jemalloc::jemalloc)
+  endif()
 endfunction()


@@ -228,7 +232,7 @@ function(merge_static_libs TARGET_NAME)
      # Get the file names of the libraries to be merged
      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
    endforeach()
-    # msvc will put libarary in directory of "/Release/xxxlib" by default 
+    # msvc will put libarary in directory of "/Release/xxxlib" by default
    #       COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
      COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -405,28 +405,50 @@ paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None
 paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
 paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
 paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.optimizer.SGDOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.SGDOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
+paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
+paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
+paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
+paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.AdamaxOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.DecayedAdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
+paddle.fluid.optimizer.FtrlOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.FtrlOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
+paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.RMSPropOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
+paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
 paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
+paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.LarsMomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -94,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
        graph_viz_pass multi_devices_graph_pass
        multi_devices_graph_print_pass multi_devices_graph_check_pass
        fuse_elewise_add_act_pass multi_batch_merge_pass
-        memory_optimize_pass)
+        memory_optimize_pass lock_free_optimize_pass)
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <memory>

 #include "paddle/fluid/framework/details/memory_reuse_types.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/sequential_execution_pass.h"
@@ -86,10 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
    if (strategy.memory_optimize_) {
      auto analysis_var_pass = AppendPass("analysis_var_pass");
    }
-    // Convert graph to run on multi-devices.
-    auto multi_devices_pass = AppendPass("multi_devices_pass");
-    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
-                                                         &strategy_);
+
+    AppendMultiDevPass(strategy);

    // Add a graph print pass to record a graph with device info.
    if (!strategy_.debug_graphviz_path_.empty()) {
@@ -115,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
    }
  }

+  // Convert graph to run on multi-devices.
+  void AppendMultiDevPass(const BuildStrategy &strategy) {
+    ir::Pass *multi_devices_pass;
+    if (strategy_.is_distribution_) {
+      multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
+    } else {
+      if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+        multi_devices_pass =
+            AppendPass("allreduce_mode_multi_devices_pass").get();
+      } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
+        multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
+      } else {
+        PADDLE_THROW("Unknown reduce strategy.");
+      }
+    }
+    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
+                                                         &strategy_);
+  }
+
 private:
  BuildStrategy strategy_;
 };
@@ -131,6 +148,10 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
  return pass_builder_;
 }

+bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
+  return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0;
+}
+
 std::unique_ptr<ir::Graph> BuildStrategy::Apply(
    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
    const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
@@ -145,22 +166,23 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(

  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
-    if (pass->Type() == "multi_devices_pass") {
-      pass->Erase("places");
-      pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
-      pass->Erase("loss_var_name");
-      pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
-      pass->Erase("local_scopes");
-      pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
+    if (IsMultiDevPass(pass->Type())) {
+      pass->Erase(kPlaces);
+      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
+      pass->Erase(kLossVarName);
+      pass->SetNotOwned<const std::string>(kLossVarName, &loss_var_name);
+      pass->Erase(kLocalScopes);
+      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
                                                    &local_scopes);
-      pass->Erase("nranks");
-      pass->Set<size_t>("nranks", new size_t(nranks));
+      pass->Erase(kNRanks);
+      pass->Set<size_t>(kNRanks, new size_t(nranks));

 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
      pass->Erase("nccl_ctxs");
      pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
+
    } else if (pass->Type() == "analysis_var_pass") {
      const std::vector<OpDesc *> *all_op_descs =
          new std::vector<OpDesc *>(main_program.Block(0).AllOps());
@@ -201,10 +223,13 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
 USE_PASS(fuse_elewise_add_act_pass);
 USE_PASS(graph_viz_pass);
 USE_PASS(multi_batch_merge_pass);
-USE_PASS(multi_devices_pass);
+USE_PASS(reduce_mode_multi_devices_pass);
+USE_PASS(allreduce_mode_multi_devices_pass);
+USE_PASS(dist_multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
 USE_PASS(analysis_var_pass);
 USE_PASS(sequential_execution_pass);
 USE_PASS(all_reduce_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
+USE_PASS(lock_free_optimize_pass);
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -74,8 +74,6 @@ struct BuildStrategy {

  bool fuse_elewise_add_act_ops_{false};

-  bool enable_data_balance_{false};
-
  bool memory_optimize_{false};

  bool memory_early_delete_{false};
@@ -84,6 +82,10 @@ struct BuildStrategy {

  bool fuse_broadcast_op_{false};

+  // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
+  // num_trainers is 1, so the current fields of build_strategy doesn't tell if
+  // it's distributed model.
+  bool is_distribution_{false};
  int num_trainers_{1};
  int trainer_id_{0};
  std::vector<std::string> trainers_endpoints_;
@@ -104,6 +106,8 @@ struct BuildStrategy {

  bool IsFinalized() const { return is_finalized_; }

+  bool IsMultiDevPass(const std::string &pass_name) const;
+
  // Apply the passes built by the pass_builder_. The passes will be
  // applied to the Program and output an ir::Graph.
  std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,

--- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include <string>
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"

@@ -21,68 +21,78 @@ namespace paddle {
 namespace framework {
 namespace details {

-bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
-  std::unordered_map<OpHandleBase *, size_t> pending_ops;
-  std::unordered_set<VarHandleBase *> pending_vars;
-  std::unordered_set<VarHandleBase *> ready_vars;
-  std::unordered_set<OpHandleBase *> ready_ops;
+class SSAGraghBuilderWithChecker : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
+    PADDLE_ENFORCE(IsValidGraph(graph.get()));
+    return graph;
+  }

-  auto insert_pending_var = [&](VarHandleBase *var) {
-    pending_vars.insert(var);
-    if (var->GeneratedOp() == nullptr) {
-      ready_vars.emplace(var);
-    }
-  };
+  bool IsValidGraph(const ir::Graph *graph) const {
+    std::unordered_map<OpHandleBase *, size_t> pending_ops;
+    std::unordered_set<VarHandleBase *> pending_vars;
+    std::unordered_set<VarHandleBase *> ready_vars;
+    std::unordered_set<OpHandleBase *> ready_ops;

-  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
-    for (auto &name_pair : var_map) {
-      for (auto &version_pair : name_pair.second) {
-        insert_pending_var(version_pair);
+    auto insert_pending_var = [&](VarHandleBase *var) {
+      pending_vars.insert(var);
+      if (var->GeneratedOp() == nullptr) {
+        ready_vars.emplace(var);
      }
-    }
-  }
+    };

-  for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
-    insert_pending_var(var);
-  }
+    for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
+      for (auto &name_pair : var_map) {
+        for (auto &version_pair : name_pair.second) {
+          insert_pending_var(version_pair);
+        }
+      }
+    }

-  for (OpHandleBase *op : ir::FilterByNodeWrapper<OpHandleBase>(*graph)) {
-    if (op->Inputs().empty()) {
-      ready_ops.insert(op);
-    } else {
-      pending_ops.insert({op, op->NoDupInputSize()});
+    for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
+      insert_pending_var(var);
    }
-  }

-  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
-    for (auto *op : set) {
-      for (auto out : op->Outputs()) {
-        ready_vars.emplace(out);
+    for (OpHandleBase *op : ir::FilterByNodeWrapper<OpHandleBase>(*graph)) {
+      if (op->Inputs().empty()) {
+        ready_ops.insert(op);
+      } else {
+        pending_ops.insert({op, op->NoDupInputSize()});
      }
    }
-    set.clear();
-  };

-  while (!pending_vars.empty()) {
-    run_all_ops(ready_ops);
+    auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
+      for (auto *op : set) {
+        for (auto out : op->Outputs()) {
+          ready_vars.emplace(out);
+        }
+      }
+      set.clear();
+    };

-    if (ready_vars.empty()) {
-      return false;
-    }
+    while (!pending_vars.empty()) {
+      run_all_ops(ready_ops);

-    for (auto ready_var : ready_vars) {
-      pending_vars.erase(ready_var);
-      for (auto *op : ready_var->PendingOps()) {
-        auto &deps = --pending_ops[op];
-        if (deps == 0) {
-          ready_ops.insert(op);
+      if (ready_vars.empty()) {
+        return false;
+      }
+
+      for (auto ready_var : ready_vars) {
+        pending_vars.erase(ready_var);
+        for (auto *op : ready_var->PendingOps()) {
+          auto &deps = --pending_ops[op];
+          if (deps == 0) {
+            ready_ops.insert(op);
+          }
        }
      }
+      ready_vars.clear();
    }
-    ready_vars.clear();
+    return true;
  }
-  return true;
-}
+};
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
+
 #include <string>
 #include <utility>
 #include <vector>
@@ -30,78 +31,70 @@ namespace framework {
 class Scope;
 namespace details {

-class MultiDevSSAGraphBuilder : public ir::Pass {
+constexpr char kLossVarName[] = "loss_var_name";
+constexpr char kPlaces[] = "places";
+constexpr char kLocalScopes[] = "local_scopes";
+constexpr char kStrategy[] = "strategy";
+constexpr char kNRanks[] = "nranks";
+
+class MultiDevSSAGraphBuilderBase : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;

- private:
-  void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
-                         size_t device_id) const;
-  void Init() const;
+  virtual void Init() const;

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  mutable platform::NCCLContextMap *nccl_ctxs_;
-#endif
+  virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;

-  int GetVarDeviceID(
-      const std::string &varname,
-      const std::unordered_map<std::string, int> &sharded_var_device) const;
+  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
+                                  const std::string &g_name) const = 0;

-  bool IsScaleLossOp(ir::Node *node) const;
+  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0;
+
+  virtual void InsertPostprocessOps(ir::Graph *result) const = 0;

-  int CreateRPCOp(
-      ir::Graph *result, ir::Node *node,
-      std::unordered_map<std::string, int> *sharded_var_device) const;
-  int CreateDistTrainOp(
-      ir::Graph *result, ir::Node *node,
-      std::unordered_map<std::string, int> *sharded_var_device) const;
+  bool UseGPU() const;
+
+  bool NeedCollectiveOps() const;
+
+  bool IsScaleLossOp(ir::Node *node) const;

  void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                              size_t num_places) const;

  void CreateScaleLossGradOp(ir::Graph *result,
                             const std::string &loss_grad_name,
-                             ir::Node *out_var_node,
+                             ir::Node *out_var_node, size_t loss_scale,
                             proto::VarType::Type dtype) const;

  VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                            int dst_dev_id) const;
+
  void CreateComputationalOp(ir::Graph *result, ir::Node *node,
                             int dev_id) const;

-  int GetOpDeviceID(
-      ir::Node *node,
-      const std::unordered_map<std::string, int> &sharded_var_device) const;
-
-  void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
+  bool IsSparseGradient(const std::string &og) const;

-  void InsertDataBalanceOp(ir::Graph *result,
-                           const std::vector<std::string> &datas) const;
+  void CreateAllReduceOp(ir::Graph *result, const std::string &og) const;

  void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
                         size_t src_dev_id) const;

+  void InsertScaleLossGradOp(ir::Graph *result, const ir::Node *node) const;
+
  void CreateFusedBroadcastOp(
      ir::Graph *result,
      const std::vector<std::unordered_set<std::string>> &bcast_varnames) const;

-  bool IsSparseGradient(const std::string &og) const;
-
-  size_t GetAppropriateDeviceID(
-      const std::vector<std::string> &var_names) const;
-
  void SetCommunicationContext(OpHandleBase *op_handle,
                               const platform::Place &p) const;

-  std::vector<ir::Node *> SortForReduceMode(
-      const std::vector<ir::Node *> &) const;
+  void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
+                         size_t device_id) const;

-  int GetOpDeviceID(
-      ir::Node *node,
-      const std::unordered_map<std::string, int> &shared_var_device,
-      std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
-      const;
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  mutable platform::NCCLContextMap *nccl_ctxs_;
+#endif

  mutable std::string loss_var_name_;
  mutable std::vector<platform::Place> places_;
@@ -109,8 +102,83 @@ class MultiDevSSAGraphBuilder : public ir::Pass {

  mutable BuildStrategy strategy_;
  mutable std::unordered_map<std::string, VarDesc *> all_vars_;
+};
+
+class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
+ protected:
+  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
+                                  const std::string &g_name) const;
+
+  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
+    return false;
+  }
+
+  virtual void InsertPostprocessOps(ir::Graph *result) const {}
+};
+
+class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
+ protected:
+  int GetVarDeviceID(const std::string &varname) const;
+
+  int GetOpDeviceID(ir::Node *node) const;
+
+  size_t GetAppropriateDeviceID(
+      const std::vector<std::string> &var_names) const;
+
+  virtual void ResetState() const;
+
+  mutable std::unordered_map<std::string, int> sharded_var_device_;
  mutable std::vector<int64_t> balance_vars_;
 };
+
+class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
+ protected:
+  virtual void Init() const;
+
+  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
+                                  const std::string &g_name) const;
+
+  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
+
+  virtual void InsertPostprocessOps(ir::Graph *result) const;
+
+  virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
+
+  virtual void ResetState() const;
+
+  int GetOpDeviceID(ir::Node *node,
+                    std::unordered_map<std::string, std::vector<ir::Node *>>
+                        *delay_ops) const;
+
+  std::vector<ir::Node *> SortForReduceMode(
+      const std::vector<ir::Node *> &topo_ops) const;
+
+  mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
+};
+
+class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
+ protected:
+  virtual void Init() const;
+
+  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
+
+  virtual void InsertPostprocessOps(ir::Graph *result) const;
+
+  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
+                                  const std::string &g_name) const;
+
+  virtual void ResetState() const;
+
+  int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
+
+  int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
+
+  mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
+  mutable bool need_broadcast_var_{false};
+};
+
+std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)

 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
+pass_library(lock_free_optimize_pass base)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
@@ -41,6 +42,7 @@ pass_library(seq_concat_fc_fuse_pass inference)
 pass_library(multi_batch_merge_pass base)
 pass_library(conv_bn_fuse_pass inference)
 pass_library(seqconv_eltadd_relu_fuse_pass inference)
+pass_library(seqpool_concat_fuse_pass inference)
 pass_library(is_test_pass base)
 pass_library(conv_elementwise_add_act_fuse_pass inference)
 pass_library(conv_elementwise_add2_act_fuse_pass inference)

--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -109,7 +109,6 @@ class Graph {
    attr_dels_[attr_name] = []() {};
  }

-  template <typename AttrType>
  void Erase(const std::string &attr_name) {
    PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph",
                   attr_name);

--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/lock_free_optimize_pass.h"
+
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+const char kSumGradOpName[] = "sum";
+// TODO(minqiyang): only support sgd at current time, please add
+// other optimizers later.
+const char kOptimizerType[] = "sgd";
+
+std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+
+  // We could collect all weights' name from SGD, where
+  // W1 <- SGD(W0, Grad0)
+  std::unordered_set<std::string> weight_var_set;
+  for (auto* node : graph->Nodes()) {
+    if (IsOpNamed(node, kOptimizerType)) {
+      auto& param_out_vars = node->Op()->Output("ParamOut");
+      PADDLE_ENFORCE(param_out_vars.size() == 1u);
+      weight_var_set.insert(param_out_vars[0]);
+    }
+  }
+
+  // find all grad's merge op via weight name, where
+  // Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
+  std::unordered_set<ir::Node*> grad_sum_op_set;
+  for (ir::Node* node : graph->Nodes()) {
+    if (IsOpNamed(node, kSumGradOpName)) {
+      for (ir::Node* output : node->outputs) {
+        // strip the last grad suffix @GRAD
+        std::string var_name = output->Name();
+        const std::string suffix(kGradVarSuffix);
+        if (var_name != suffix && var_name.size() > suffix.size() &&
+            var_name.substr(var_name.size() - suffix.size()) == suffix) {
+          // if so then strip them off
+          var_name = var_name.substr(0, var_name.size() - suffix.size());
+          if (weight_var_set.find(var_name) != weight_var_set.end()) {
+            grad_sum_op_set.insert(node);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // get the forward op and backward op pairs, where
+  // out <- forward(X, W)
+  // Grad1 <- backward(out, X')
+  // Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
+  // W0 <- SGD(W1, Grad0)
+  for (ir::Node* node : grad_sum_op_set) {
+    for (ir::Node* merged_grad_var : node->outputs) {
+      // find the optimizers connected with sum op
+      if (IsVarNameEndsWith(merged_grad_var, kGradVarSuffix) &&
+          merged_grad_var->outputs.size() == 1u) {
+        ir::Node* opt_node = merged_grad_var->outputs[0];
+        VLOG(3) << "Found opt node " << opt_node->Name();
+
+        // find the backward op connected with sum op
+        for (ir::Node* unmerged_grad_var : node->inputs) {
+          if (IsVarNameContains(unmerged_grad_var, kGradVarSuffix) &&
+              unmerged_grad_var->inputs.size() == 1u) {
+            ir::Node* backward_op = unmerged_grad_var->inputs[0];
+
+            VLOG(3) << "Found backward_op " << backward_op->Name();
+
+            // find the forward op related to the backward op
+            ir::Node* forward_op =
+                FindForwardOpViaBackwardOp(graph.get(), backward_op);
+
+            VLOG(3) << "Found forward_op " << forward_op->Name();
+
+            PADDLE_ENFORCE(forward_op);
+
+            Node* new_optimizer_node = CreateNewSGDNode(
+                graph.get(), forward_op, backward_op, node, opt_node);
+
+            PADDLE_ENFORCE(new_optimizer_node);
+          }
+        }
+      }
+    }
+  }
+
+  // Remove the sum_op and its' outputs and connected Optimizers
+  for (Node* sum_op : grad_sum_op_set) {
+    for (Node* sum_op_output : sum_op->outputs) {
+      for (Node* optimize_op : sum_op_output->outputs) {
+        if (optimize_op->NodeType() == Node::Type::kOperation &&
+            optimize_op->Name() == kOptimizerType) {
+          VLOG(3) << "remove optimize_op: " << optimize_op->Name() << "_"
+                  << optimize_op->id();
+          graph->RemoveNode(optimize_op);
+        }
+      }
+      VLOG(3) << "remove sum_op_output: " << sum_op_output->Name() << "_"
+              << sum_op_output->id();
+      graph->RemoveNode(sum_op_output);
+    }
+    VLOG(3) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id();
+    graph->RemoveNode(sum_op);
+  }
+
+  for (auto* node : graph->Nodes()) {
+    for (Node* output_node : node->outputs) {
+      if (output_node->Name() == "sgd") {
+        VLOG(3) << "Node link to SGD: " << node->Name() << "_" << node->id()
+                << " --> " << output_node->Name() << "_" << output_node->id();
+        for (Node* input_node : node->inputs) {
+          VLOG(3) << "SGD Input link: " << input_node->Name() << "_"
+                  << input_node->id() << " --> " << node->Name() << "_"
+                  << node->id();
+        }
+      }
+    }
+  }
+
+  return graph;
+}
+
+ir::Node* LockFreeOptimizePass::CreateNewSGDNode(
+    ir::Graph* graph, ir::Node* forward_node, ir::Node* backward_node,
+    ir::Node* grad_sum_node, ir::Node* optimize_node) const {
+  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE(forward_node);
+  PADDLE_ENFORCE(backward_node);
+  PADDLE_ENFORCE(grad_sum_node);
+  PADDLE_ENFORCE(optimize_node);
+
+  // find the grad var node between the grad sum node and backward_node
+  std::vector<ir::Node*> grad_vars =
+      FindConnectedNode(backward_node, grad_sum_node);
+  ir::Node* grad_node = nullptr;
+  for (ir::Node* node : grad_vars) {
+    if (!ir::IsControlDepVar(*node)) {
+      grad_node = node;
+    }
+  }
+  PADDLE_ENFORCE(grad_node);
+
+  // create a new SGD node
+  OpDesc* old_desc = optimize_node->Op();
+  // keep with the same block between new optimizer and the old one
+  OpDesc new_desc(*old_desc, old_desc->Block());
+  new_desc.SetInput("Param", old_desc->Input("Param"));
+  new_desc.SetInput("LearningRate", old_desc->Input("LearningRate"));
+  new_desc.SetInput("Grad", std::vector<std::string>({grad_node->Name()}));
+  new_desc.SetOutput("ParamOut", old_desc->Output("ParamOut"));
+
+  std::vector<std::string> op_role_vars = boost::get<std::vector<std::string>>(
+      new_desc.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+  // replace the second op role var, because the grad name was
+  // changed in new optimizer
+  op_role_vars.pop_back();
+  op_role_vars.push_back(grad_node->Name());
+  new_desc.SetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
+                   op_role_vars);
+  new_desc.SetType(kOptimizerType);
+
+  // set backward op's op role var, this will be used to
+  // set device_id in multi_device_pass
+  backward_node->Op()->SetAttr(
+      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), op_role_vars);
+  // backward_node->Op()->SetAttr(
+  // framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), {});
+
+  // keep with the same output nodes between new optimizer and the
+  // old one
+  Node* sgd_node = graph->CreateOpNode(&new_desc);
+
+  // change all outputs of the optimize_node to the new one
+  ReplaceAllDownstreamNode(optimize_node, sgd_node);
+
+  // find connected node between forward node and optimize node
+  // and replace the optimize node to new sgd node
+  std::vector<ir::Node*> forward_opt_connected_nodes =
+      FindConnectedNode(forward_node, optimize_node);
+  for (ir::Node* node : forward_opt_connected_nodes) {
+    ReplaceUpstreamNode(node, optimize_node, sgd_node);
+  }
+
+  // find connected node between backward node and optimize node
+  // and replace the optimize node to new sgd node
+  std::vector<ir::Node*> backward_opt_connected_nodes =
+      FindConnectedNode(backward_node, optimize_node);
+  for (ir::Node* node : backward_opt_connected_nodes) {
+    ReplaceUpstreamNode(node, optimize_node, sgd_node);
+  }
+
+  // SGD must have only one param and LR in
+  PADDLE_ENFORCE(old_desc->Input("LearningRate").size() == 1u);
+  PADDLE_ENFORCE(old_desc->Input("Param").size() == 1u);
+
+  // LR and weight nodes should be copied
+  for (Node* upstream_node : optimize_node->inputs) {
+    if (upstream_node->Name() == old_desc->Input("LearningRate")[0] ||
+        upstream_node->Name() == old_desc->Input("Param")[0]) {
+      ReplaceUpstreamNode(upstream_node, optimize_node, sgd_node);
+    }
+  }
+
+  VLOG(3) << "Create new opt node" << sgd_node->Name() << "_" << sgd_node->id();
+
+  return sgd_node;
+}
+
+std::vector<ir::Node*> LockFreeOptimizePass::FindConnectedNode(
+    ir::Node* upstream_node, ir::Node* downstream_node) const {
+  std::vector<ir::Node*> result;
+  for (ir::Node* out_node : upstream_node->outputs) {
+    for (ir::Node* in_node : downstream_node->inputs) {
+      if (in_node == out_node) {
+        result.push_back(in_node);
+      }
+    }
+  }
+
+  return result;
+}
+
+void LockFreeOptimizePass::ReplaceUpstreamNode(
+    ir::Node* upstream_node, ir::Node* old_optimizer_node,
+    ir::Node* new_optimizer_node) const {
+  PADDLE_ENFORCE(upstream_node);
+  PADDLE_ENFORCE(old_optimizer_node);
+  PADDLE_ENFORCE(new_optimizer_node);
+
+  // Remove the old_optimizer_node from upstream_node's outputs vector
+  auto& output_node_vec = upstream_node->outputs;
+  for (auto output_node_iter = output_node_vec.begin();
+       output_node_iter != output_node_vec.end();) {
+    if (*output_node_iter == old_optimizer_node) {
+      output_node_vec.erase(output_node_iter);
+      break;
+    } else {
+      ++output_node_iter;
+    }
+  }
+
+  // Add the new_optimizer_node to upstream_node's outputs vector
+  output_node_vec.emplace_back(new_optimizer_node);
+  new_optimizer_node->inputs.emplace_back(upstream_node);
+}
+
+void LockFreeOptimizePass::ReplaceAllDownstreamNode(
+    ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const {
+  PADDLE_ENFORCE(old_optimizer_node);
+  PADDLE_ENFORCE(new_optimizer_node);
+
+  for (ir::Node* downstream_node : old_optimizer_node->outputs) {
+    // Remove the old_optimizer_node from downstream_node's inputs vector
+    auto& input_node_vec = downstream_node->inputs;
+    for (auto input_node_iter = input_node_vec.begin();
+         input_node_iter != input_node_vec.end();) {
+      if (*input_node_iter == old_optimizer_node) {
+        input_node_vec.erase(input_node_iter);
+        break;
+      } else {
+        ++input_node_iter;
+      }
+    }
+
+    // Add the new_optimizer_node to downstream_node's inputs vector
+    input_node_vec.emplace_back(new_optimizer_node);
+    new_optimizer_node->outputs.emplace_back(downstream_node);
+  }
+}
+
+ir::Node* LockFreeOptimizePass::FindForwardOpViaBackwardOp(
+    ir::Graph* graph, ir::Node* backward_node) const {
+  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE(backward_node);
+
+  // strip the suffix _grad of backward_node's name
+  std::string forward_op_name = backward_node->Name();
+  const std::string suffix("_grad");
+  if (forward_op_name != suffix && forward_op_name.size() > suffix.size() &&
+      forward_op_name.substr(forward_op_name.size() - suffix.size()) ==
+          suffix) {
+    // if so then strip them off
+    forward_op_name =
+        forward_op_name.substr(0, forward_op_name.size() - suffix.size());
+  } else {
+    LOG(WARNING) << "Illegal backward node's name " << backward_node->Name()
+                 << " id " << backward_node->id();
+
+    return nullptr;
+  }
+
+  for (ir::Node* node : graph->Nodes()) {
+    if (node->Name() == forward_op_name) {
+      if (node->outputs.size() == 0u) {
+        // if forward_node has no output, then it has NO grad op
+        continue;
+      }
+
+      // check whether all inputs of the backward_op that ends_with @GRAD
+      // comes from the output of forward_op is the input of the backward_op
+      bool is_related_forward_node = true;
+      for (ir::Node* backward_input : backward_node->inputs) {
+        if (IsVarNameEndsWith(backward_input, kGradVarSuffix)) {
+          bool meets_correct_output = false;
+          for (ir::Node* forward_output : node->outputs) {
+            if (forward_output->Name() + kGradVarSuffix ==
+                backward_input->Name()) {
+              meets_correct_output = true;
+              break;
+            }
+          }
+
+          if (!meets_correct_output) {
+            is_related_forward_node = false;
+            break;
+          }
+        }
+      }
+
+      if (is_related_forward_node) {
+        return node;
+      }
+    }
+  }
+
+  return nullptr;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(lock_free_optimize_pass,
+              paddle::framework::ir::LockFreeOptimizePass);
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
+#define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
+
+#include <string>
+#include <vector>
+
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class Node;
+
+/*
+* Remove the sum op of all gradients of the backward op.
+* And remove the dependecies of the optimizer related to the
+* same backward op.
+*
+* Before this pass:
+*
+* forward_op1 forward_op2
+*     |            |
+*  grad_op1    grad_op2
+*        \      /
+*          \  /
+*         sum_op
+*           |
+*         sgd_op
+*
+* After this pass:
+* forward_op1 forward_op2
+*     |            |
+*  grad_op1    grad_op2
+*     |            |
+*  sgd_op1      sgd_op2
+*
+* sgd_op1 and sgd_op2 will update the same weight which holds the same
+* memory, so we could benefits from the acceleration
+*/
+class LockFreeOptimizePass : public Pass {
+ public:
+  virtual ~LockFreeOptimizePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+ private:
+  // Create a new sgd node via current optimizer node
+  ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node,
+                             ir::Node* backward_node, ir::Node* grad_sum_node,
+                             ir::Node* optimize_node) const;
+
+  // Replace the input weight's optimizers
+  void ReplaceUpstreamNode(ir::Node* upstream_node,
+                           ir::Node* old_optimizer_node,
+                           ir::Node* new_optimizer_node) const;
+
+  // Replace the output weight's optimizers
+  void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node,
+                                ir::Node* new_optimizer_node) const;
+
+  // Find all weight variables in graph
+  bool FindAllWeightVars(ir::Graph* graph) const;
+
+  // Find the forward_op node via the backward_op node
+  ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph,
+                                       ir::Node* backward_node) const;
+
+  std::vector<ir::Node*> FindConnectedNode(ir::Node* upstream_node,
+                                           ir::Node* downstream_node) const;
+
+  inline bool IsOpNamed(ir::Node* node, const std::string& name) const {
+    PADDLE_ENFORCE(node);
+
+    return node->NodeType() == Node::Type::kOperation && node->Name() == name;
+  }
+
+  inline bool IsVarNamed(ir::Node* node, const std::string& name) const {
+    PADDLE_ENFORCE(node);
+
+    return node->NodeType() == Node::Type::kVariable && node->Name() == name;
+  }
+
+  inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const {
+    PADDLE_ENFORCE(node);
+
+    return node->NodeType() == Node::Type::kVariable &&
+           boost::algorithm::ends_with(node->Name(), name);
+  }
+
+  inline bool IsVarNameContains(ir::Node* node, const std::string& name) const {
+    PADDLE_ENFORCE(node);
+
+    return node->NodeType() == Node::Type::kVariable &&
+           node->Name().find(name) != std::string::npos;
+  }
+
+  inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const {
+    PADDLE_ENFORCE(ctrl_dep_node);
+    PADDLE_ENFORCE(node);
+
+    return IsControlDepVar(*ctrl_dep_node) &&
+           ctrl_dep_node->inputs.size() >= 1u &&
+           ctrl_dep_node->inputs[0] == node;
+  }
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+#endif  // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+
+#define MAX_CONCAT_INPUTS 200
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
+                                  const std::string& name_scope,
+                                  int num_inputs) {
+  auto is_concat_op_with_inputs = [](Node* x, int num) -> bool {
+    return x && x->IsOp() && x->Op()->Type() == "concat" &&
+           x->Op()->Input("X").size() == static_cast<size_t>(num);
+  };
+
+  auto is_nth_input_var_of_concat = [=](Node* x, int idx) -> bool {
+    return x && x->IsVar() && VarLinksToOp(x, "concat") &&
+           x->outputs.size() == 1 && IsNthInput(x, x->outputs[0], "X", idx) &&
+           is_concat_op_with_inputs(x->outputs[0], num_inputs);
+  };
+
+  auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=](
+      Node* x, const std::string& type, int idx) -> bool {
+    bool ok = x && x->IsOp() && x->Op()->Type() == "sequence_pool" &&
+              x->Op()->HasAttr("pooltype") &&
+              boost::get<std::string>(x->Op()->GetAttr("pooltype")) == type &&
+              x->outputs.size() == 2;  // seqpool should only have 2 outputs
+    if (ok) {
+      // only one output of seqpool_op is nth_input_var of concat
+      // the other one should be unused empty var
+      if (is_nth_input_var_of_concat(x->outputs[0], idx)) {
+        ok = ok && x->outputs[1]->IsVar() && x->outputs[1]->outputs.size() == 0;
+      } else {
+        ok = ok && is_nth_input_var_of_concat(x->outputs[1], idx) &&
+             x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0;
+      }
+    }
+    return ok;
+  };
+
+  auto* concat_op = pattern->NewNode(
+      [=](Node* x) { return is_concat_op_with_inputs(x, num_inputs); },
+      name_scope + "/concat_op");
+  concat_op->assert_op_attr<int>("axis", 1);
+
+  auto* concat_out_var = pattern->NewNode(
+      [=](Node* x) {
+        return x && x->IsVar() && VarLinksFromOp(x, "concat") &&
+               x->inputs.size() == 1 &&
+               is_concat_op_with_inputs(x->inputs[0], num_inputs);
+      },
+      name_scope + "/concat_out_var");
+  concat_out_var->assert_is_only_output_of_op("concat");
+
+  std::vector<PDNode*> seqpool_ops_input_var(num_inputs);
+  std::vector<PDNode*> seqpool_ops_output_var(num_inputs);
+  std::vector<PDNode*> seqpool_ops(num_inputs);
+
+  for (int i = 0; i < num_inputs; ++i) {
+    seqpool_ops_output_var[i] = pattern->NewNode(
+        [=](Node* x) {
+          return x && x->IsVar() && is_nth_input_var_of_concat(x, i) &&
+                 x->inputs.size() == 1 &&
+                 is_seqpool_op_with_pootype_of_nth_input_of_concat(x->inputs[0],
+                                                                   "SUM", i);
+        },
+        name_scope + "/sequence_pool_out_" + std::to_string(i));
+
+    seqpool_ops[i] = pattern->NewNode(
+        [=](Node* x) {
+          return x && x->IsOp() &&
+                 is_seqpool_op_with_pootype_of_nth_input_of_concat(x, "SUM", i);
+        },
+        name_scope + "/sequence_pool_op_" + std::to_string(i));
+
+    seqpool_ops_input_var[i] = pattern->NewNode(
+        [=](Node* x) {
+          return x && x->IsVar() && x->outputs.size() >= 1 &&
+                 is_seqpool_op_with_pootype_of_nth_input_of_concat(
+                     x->outputs[0], "SUM", i);
+        },
+        name_scope + "/sequence_pool_in_" + std::to_string(i));
+
+    // Links
+    seqpool_ops[i]
+        ->LinksFrom({seqpool_ops_input_var[i]})
+        .LinksTo({seqpool_ops_output_var[i]});
+  }
+  concat_op->LinksFrom(seqpool_ops_output_var).LinksTo({concat_out_var});
+  return concat_out_var;
+}
+
+int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
+                int num_inputs) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+  BuildSeqPoolConcatPattern(pattern, name_scope, num_inputs);
+
+  auto retrieve_node = [](const std::string& name,
+                          const GraphPatternDetector::subgraph_t& subgraph,
+                          const PDPattern& pat) -> Node* {
+    PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)),
+                   "pattern has no Node called %s", name.c_str());
+    Node* p = subgraph.at(pat.RetrieveNode(name));
+    PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str());
+    return p;
+  };
+
+  int fusion_count{0};
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle SeqPool Concat fuse";
+    std::vector<std::string> input_names(num_inputs);
+    std::vector<Node*> input_vars(num_inputs);
+    auto& fused_pattern = gpd.pattern();
+    for (int i = 0; i < num_inputs; ++i) {
+      input_vars[i] =
+          retrieve_node(name_scope + "/sequence_pool_in_" + std::to_string(i),
+                        subgraph, fused_pattern);
+      input_names[i] = input_vars[i]->Name();
+    }
+    auto* concat_op =
+        retrieve_node(name_scope + "/concat_op", subgraph, fused_pattern);
+    auto* concat_out_var =
+        retrieve_node(name_scope + "/concat_out_var", subgraph, fused_pattern);
+    auto* seqpool_op0 = retrieve_node(name_scope + "/sequence_pool_op_0",
+                                      subgraph, fused_pattern);
+
+    // Create New OpDesc
+    OpDesc op_desc;
+    op_desc.SetType("fusion_seqpool_concat");
+    op_desc.SetInput("X", input_names);
+    op_desc.SetAttr("pooltype", seqpool_op0->Op()->GetAttr("pooltype"));
+    op_desc.SetAttr("axis", concat_op->Op()->GetAttr("axis"));
+    op_desc.SetOutput("Out", {concat_out_var->Name()});
+    auto* op = graph->CreateOpNode(&op_desc);
+    for (size_t i = 0; i < input_vars.size(); ++i) {
+      IR_NODE_LINK_TO(input_vars[i], op);
+    }
+    IR_NODE_LINK_TO(op, concat_out_var);
+
+    std::unordered_set<const Node*> marked_nodes;
+    for (auto& item : subgraph) {
+      marked_nodes.insert(item.second);
+    }
+    for (size_t i = 0; i < input_vars.size(); ++i) {
+      marked_nodes.erase(input_vars[i]);
+    }
+    marked_nodes.erase(concat_out_var);
+    GraphSafeRemoveNodes(graph, marked_nodes);
+    ++fusion_count;
+  };
+
+  gpd(graph, handler);
+  return fusion_count;
+}
+
+std::unique_ptr<ir::Graph> SeqPoolConcatFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+  int fusion_count = 0;
+  for (int i = MAX_CONCAT_INPUTS; i > 0; --i) {
+    fusion_count += BuildFusion(
+        graph.get(), name_scope_ + "/" + std::to_string(i), param_scope(), i);
+  }
+  AddStatis(fusion_count);
+
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(seqpool_concat_fuse_pass,
+              paddle::framework::ir::SeqPoolConcatFusePass);
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class SeqPoolConcatFusePass : public FusePassBase {
+ public:
+  virtual ~SeqPoolConcatFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  const std::string name_scope_{"seqpool_concat_fuse"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -40,14 +40,14 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,

 void NaiveExecutor::Run() {
 #ifndef PADDLE_ON_INFERENCE
-  LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
-                              "cmake flag ON_INFER is not set.";
-  LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
-                              "variables will be reused to save the allocation "
-                              "overhead.";
-  LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
-                              "setting the cmake flag ON_INFER=ON if you are "
-                              "running Paddle Inference";
+  LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the "
+                             "cmake flag ON_INFER is not set.";
+  LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and "
+                             "variables will be reused to save the allocation "
+                             "overhead.";
+  LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by "
+                             "setting the cmake flag ON_INFER=ON if you are "
+                             "running Paddle Inference";
 #endif  // PADDLE_ON_INFERENCE
  for (auto &op : ops_) {
    VLOG(3) << std::this_thread::get_id() << " run " << op->Type()

--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -32,8 +32,11 @@ std::map<std::string,
                                std::string, std::shared_ptr<ngraph::Node>>>)>>
    NgraphBridge::NG_NODE_MAP = {
        {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
+        {"mean", paddle::operators::ngraphs::BuildMeanNode},
+        {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode},
        {"mul", paddle::operators::ngraphs::BuildMulNode},
        {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
+        {"scale", paddle::operators::ngraphs::BuildScaleNode},
        {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
        {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
        {"top_k", paddle::operators::ngraphs::BuildTopKNode}};

--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
    }
  }

-  backend_->call(ngraph_function_, t_out, t_in);
+  backend_->call(backend_->compile(ngraph_function_), t_out, t_in);
 }  // NgraphEngine::RunImpl
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -193,15 +193,14 @@ ParallelExecutor::ParallelExecutor(
    const std::unordered_set<std::string> &bcast_vars,
    const ProgramDesc &main_program, const std::string &loss_var_name,
    Scope *scope, const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
-    size_t num_trainers, size_t trainer_id)
+    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
    : member_(new ParallelExecutorPrivate(places)) {
  member_->global_scope_ = scope;
  member_->use_cuda_ = exec_strategy.use_cuda_;
  member_->build_strategy_ = build_strategy;
  member_->use_all_reduce_ =
      build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
-  member_->nranks_ = num_trainers * places.size();
+  member_->nranks_ = build_strategy.num_trainers_ * places.size();

  if (!member_->use_all_reduce_) {
    PADDLE_ENFORCE(places.size() > 1,
@@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor(
    }

    member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
-        member_->places_, nccl_id, num_trainers, trainer_id));
+        member_->places_, nccl_id, build_strategy.num_trainers_,
+        build_strategy.trainer_id_));
 #else
    PADDLE_THROW("Not compiled with CUDA");
 #endif

--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -50,8 +50,7 @@ class ParallelExecutor {
                            const std::string &loss_var_name, Scope *scope,
                            const std::vector<Scope *> &local_scopes,
                            const ExecutionStrategy &exec_strategy,
-                            const BuildStrategy &build_strategy,
-                            size_t num_trainers = 1, size_t trainer_id = 0);
+                            const BuildStrategy &build_strategy);

  ~ParallelExecutor();


--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) {
 }

 Variable* Scope::Var(std::string* name) {
-  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
+  SCOPE_VARS_WRITER_LOCK
+  auto new_name = std::to_string(reinterpret_cast<uintptr_t>(this)) + "." +
+                  std::to_string(vars_.size());
  if (name != nullptr) {
    *name = new_name;
  }
-  SCOPE_VARS_WRITER_LOCK
  return VarInternal(new_name);
 }


--- a/paddle/fluid/framework/var_type_traits.cc
+++ b/paddle/fluid/framework/var_type_traits.cc
@@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder {

 }  // namespace detail

-const std::type_index &ToTypeIndex(int var_id) {
+const std::type_index &VarTraitIdToTypeIndex(int var_id) {
  return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
 }

-const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); }
+const char *ToTypeName(int var_id) {
+  return VarTraitIdToTypeIndex(var_id).name();
+}

-int ToTypeId(const std::type_index &type) {
+int TypeIndexToVarTraitId(const std::type_index &type) {
  return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
 }


--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -66,8 +66,8 @@ namespace paddle {
 namespace framework {

 const char *ToTypeName(int var_id);
-const std::type_index &ToTypeIndex(int var_id);
-int ToTypeId(const std::type_index &type);
+const std::type_index &VarTraitIdToTypeIndex(int var_id);
+int TypeIndexToVarTraitId(const std::type_index &type);

 namespace detail {


--- a/paddle/fluid/framework/var_type_traits_test.cc
+++ b/paddle/fluid/framework/var_type_traits_test.cc
@@ -45,10 +45,11 @@ struct TypeIndexChecker {
    constexpr auto kId = VarTypeTrait<Type>::kId;
    std::type_index actual_type(typeid(Type));
    EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name()));
-    EXPECT_EQ(ToTypeIndex(kId), actual_type);
-    EXPECT_EQ(ToTypeId(actual_type), kId);
-    EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type);
-    EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId);
+    EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type);
+    EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId);
+    EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)),
+              actual_type);
+    EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId);

    EXPECT_TRUE(var_id_set->count(kId) == 0);              // NOLINT
    EXPECT_TRUE(type_index_set->count(actual_type) == 0);  // NOLINT

--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -77,6 +77,7 @@ class PreparedOp {
  framework::OperatorWithKernel::OpKernelFunc func;
  platform::DeviceContext* dev_ctx;
 };
+
 class OpBase;

 class VarBase {

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
       i++) {
    LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i]
              << " result: " << result[i];
-    PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
-                   result[i]);
+    EXPECT_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
+                1e-3);
  }
 }


--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -123,8 +123,6 @@ struct Argument {
  DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
  DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
  DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
-  DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller,
-                      std::function<bool(const framework::ir::Node*)>);
  DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);

--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument,
  for (const std::string &pass_name : passes) {
    auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);

-    // Set some pass attributes.
-    if (pass_name == "ir_analysis_pass") {
-      pass->Set("tensorrt_node_teller",
-                new SubgraphDetector::NodeInsideSubgraphTeller(
-                    argument->tensorrt_node_teller()));
-    }
-
    if (pass_name == "graph_viz_pass") {
      std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
                                  (pre_pass.empty() ? "origin" : pre_pass) +
@@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument,
    }

    if (pass_name == "tensorrt_subgraph_pass") {
-      PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
-      pass->SetNotOwned("tensorrt_node_teller",
-                        argument->tensorrt_node_teller_ptr());
      pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
      pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
      pass->Set("min_subgraph_size",

--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
 cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
-cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector)
-set(analysis_deps ${analysis_deps}
-        subgraph_detector tensorrt_subgraph_pass
-        CACHE INTERNAL "")

-set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
-file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
-set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
+if (TENSORRT_FOUND)
+  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
+
+  set(analysis_deps ${analysis_deps}
+          subgraph_detector tensorrt_subgraph_pass
+          CACHE INTERNAL "")
+
+  set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
+  file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
+  set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
+endif()
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -20,6 +20,7 @@
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
 #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
+#include "paddle/fluid/inference/tensorrt/op_teller.h"

 namespace paddle {
 namespace inference {
@@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
    std::unique_ptr<framework::ir::Graph> graph) const {
  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());

-  auto teller =
-      Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller");
+  auto teller = [](const framework::ir::Node *node) {
+    if (!node->IsOp() || !node->Op()) return false;
+    return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
+  };

  SubGraphFuser fuser(graph.get(), teller,
                      Get<int>("min_subgraph_size") /*min subgraph size*/);
@@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters(

 REGISTER_PASS(tensorrt_subgraph_pass,
              paddle::inference::analysis::TensorRtSubgraphPass)
-    .RequirePassAttr("tensorrt_node_teller")
    .RequirePassAttr("max_batch_size")
    .RequirePassAttr("workspace_size")
    .RequirePassAttr("min_subgraph_size");
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
@@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps}
        ir_graph_build_pass
        ir_analysis_pass
        analysis_passes
+        subgraph_detector
        CACHE INTERNAL "")
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
@@ -27,9 +27,6 @@ namespace analysis {

 void IrAnalysisComposePass::RunImpl(Argument *argument) {
  ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
-  if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
-    InitTensorRTAttrs(argument);
-  }
  ApplyIrPasses(argument);
  CollectFusionStatis(argument);
 }
@@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const {
  return "ir-analysis-compose-pass";
 }

-void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
-  if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
-    LOG(INFO) << "Initing TensorRT pass";
-    argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) {
-      std::unordered_set<std::string> teller_set(
-          {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
-           "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-           "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
-           "conv2d_transpose", "leaky_relu"});
-      if (!node->IsOp()) return false;
-
-      if (teller_set.count(node->Op()->Type())) {
-        return true;
-      } else {
-        return false;
-      }
-    });
-  }
-}
-
 void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
  std::vector<std::string> passes({
      "ir_graph_build_pass", "ir_analysis_pass",

--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
@@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass {
  std::string repr() const override;

 private:
-  void InitTensorRTAttrs(Argument* argument);
-
  void ApplyIrPasses(Argument* argument);

  void CollectFusionStatis(Argument* argument);

--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -14,86 +14,101 @@

 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle_pass_builder.h"  // NOLINT
+#include "paddle/fluid/platform/gpu_info.h"

 namespace paddle {

 PassStrategy *contrib::AnalysisConfig::pass_builder() const {
-  PADDLE_ENFORCE(
-      pass_builder_.get(),
-      "Should call constructor first, that will init the pass_builder_.");
+  if (!pass_builder_.get()) {
+    if (use_gpu_) {
+      LOG(INFO) << "Create GPU IR passes";
+      pass_builder_.reset(new GpuPassStrategy);
+    } else {
+      LOG(INFO) << "Create CPU IR passes";
+      pass_builder_.reset(new CpuPassStrategy);
+    }
+  } else if (pass_builder_->use_gpu() ^ use_gpu()) {
+    LOG(WARNING) << "The use_gpu flag is not compatible between Config and "
+                    "PassBuilder, the flags are "
+                 << use_gpu() << " " << pass_builder_->use_gpu();
+    LOG(WARNING) << "Please make them compatible, still use the existing "
+                    "PassBuilder.";
+  }
+
  return pass_builder_.get();
 }

-contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) {
-  this->use_gpu = use_gpu;
-  if (use_gpu) {
-    pass_builder_.reset(new GpuPassStrategy);
-  } else {
-    pass_builder_.reset(new CpuPassStrategy);
-  }
+contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
+  model_dir_ = model_dir;
+}
+contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
+                                        const std::string &params_file) {
+  prog_file_ = prog_file;
+  params_file_ = params_file;
+}
+void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path,
+                                       const std::string &params_file_path) {
+  prog_file_ = prog_file_path;
+  params_file_ = params_file_path;
+}
+void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
+                                           int device_id) {
+#ifdef PADDLE_WITH_CUDA
+  use_gpu_ = true;
+  memory_pool_init_size_mb_ = memory_pool_init_size_mb;
+  device_id_ = device_id;
+#else
+  LOG(ERROR) << "Please compile with gpu to EnableGpu";
+  use_gpu_ = false;
+#endif
 }
+void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; }

 contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
-  // fields from Config
-  model_dir = other.model_dir;
-  // fields from NativeConfig
-  use_gpu = other.use_gpu;
-  device = other.device;
-  fraction_of_gpu_memory = other.fraction_of_gpu_memory;
-  prog_file = other.prog_file;
-  param_file = other.param_file;
-  specify_input_name = other.specify_input_name;
-  cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
-  // fields from this.
-  enable_ir_optim = other.enable_ir_optim;
-  // For mkldnn
-  use_mkldnn_ = other.use_mkldnn_;
-  mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
-
-  use_feed_fetch_ops = other.use_feed_fetch_ops;
-  use_tensorrt_ = other.use_tensorrt_;
-  tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
-  tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
-  tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
-  model_from_memory_ = other.model_from_memory_;
-
-  if (use_gpu) {
+#define CP_MEMBER(member__) member__ = other.member__;
+
+  // Model related.
+  CP_MEMBER(model_dir_);
+  CP_MEMBER(prog_file_);
+  CP_MEMBER(params_file_);
+  CP_MEMBER(model_from_memory_);  // the memory model reuses prog_file_ and
+                                  // params_file_ fields.
+  // Gpu releated.
+  CP_MEMBER(use_gpu_);
+  CP_MEMBER(device_id_);
+  CP_MEMBER(memory_pool_init_size_mb_);
+  // TensorRT releated.
+  CP_MEMBER(use_tensorrt_);
+  CP_MEMBER(tensorrt_workspace_size_);
+  CP_MEMBER(tensorrt_max_batchsize_);
+  CP_MEMBER(tensorrt_min_subgraph_size_);
+  // MKLDNN releated.
+  CP_MEMBER(use_mkldnn_);
+  CP_MEMBER(mkldnn_enabled_op_types_);
+
+  // Ir related.
+  CP_MEMBER(enable_ir_optim_);
+  CP_MEMBER(use_feed_fetch_ops_);
+  CP_MEMBER(ir_debug_);
+  CP_MEMBER(specify_input_name_);
+
+  CP_MEMBER(cpu_math_library_num_threads_);
+
+  CP_MEMBER(serialized_info_cache_);
+
+  if (use_gpu_) {
    pass_builder_.reset(new GpuPassStrategy(
        *static_cast<GpuPassStrategy *>(other.pass_builder())));
  } else {
    pass_builder_.reset(new CpuPassStrategy(
        *static_cast<CpuPassStrategy *>(other.pass_builder())));
  }
-}

-contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
-  // fields from Config
-  model_dir = other.model_dir;
-  // fields from NativeConfig
-  use_gpu = other.use_gpu;
-  device = other.device;
-  fraction_of_gpu_memory = other.fraction_of_gpu_memory;
-  prog_file = other.prog_file;
-  param_file = other.param_file;
-  specify_input_name = other.specify_input_name;
-  cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
-  // fields from this.
-  enable_ir_optim = other.enable_ir_optim;
-  // For mkldnn
-  use_mkldnn_ = other.use_mkldnn_;
-  mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
-
-  use_feed_fetch_ops = other.use_feed_fetch_ops;
-  use_tensorrt_ = other.use_tensorrt_;
-  tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
-  tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
-  tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
-  model_from_memory_ = other.model_from_memory_;
-
-  pass_builder_ = std::move(other.pass_builder_);
+#undef CP_MEMBER
 }

 void contrib::AnalysisConfig::EnableMKLDNN() {
@@ -112,17 +127,90 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
  use_tensorrt_ = true;
  tensorrt_workspace_size_ = workspace_size;
  tensorrt_max_batchsize_ = max_batch_size;
-  tensorrt_min_subgraph_size_ = min_subgraph_size;
-  // Append after the conv+affine_channel fuse pass.
-  pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
+}
+
+void contrib::AnalysisConfig::Update() {
+  auto info = SerializeInfoCache();
+  if (info == serialized_info_cache_) return;
+
+  if (use_gpu_) {
+    pass_builder_.reset(new GpuPassStrategy);
+  } else {
+    pass_builder_.reset(new CpuPassStrategy);
+  }
+
+  if (use_tensorrt_) {
+    if (!use_gpu_) {
+      LOG(ERROR)
+          << "TensorRT engine is not available when EnableGpu() not actived.";
+    } else {
+      // Append after the infer_clean pass.
+      pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
+    }
+  }
+
+  if (use_mkldnn_) {
+    if (!enable_ir_optim_) {
+      LOG(ERROR)
+          << "EnableMKLDNN() only works when IR optimization is enabled.";
+    }
+#ifdef PADDLE_WITH_MKLDNN
+    pass_builder()->EnableMKLDNN();
+    use_mkldnn_ = true;
+#else
+    LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
+    use_mkldnn_ = false;
+#endif
+  }
+
+  if (ir_debug_) {
+    pass_builder()->TurnOnDebug();
+  }
+}
+
+std::string contrib::AnalysisConfig::SerializeInfoCache() {
+  std::stringstream ss;
+  ss << use_gpu_;
+  ss << memory_pool_init_size_mb_;
+
+  ss << use_tensorrt_;
+  ss << tensorrt_workspace_size_;
+  ss << tensorrt_max_batchsize_;
+
+  ss << use_mkldnn_;
+  ss << enable_ir_optim_;
+  ss << use_feed_fetch_ops_;
+  ss << ir_debug_;
+
+  return ss.str();
+}
+
+void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
+    int cpu_math_library_num_threads) {
+  cpu_math_library_num_threads_ = cpu_math_library_num_threads;
+}
+
+float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
+#ifdef PADDLE_WITH_CUDA
+  // Get the GPU memory details and calculate the fraction of memory for the
+  // GPU memory pool.
+  size_t gpu_used, gpu_available;
+  platform::GpuMemoryUsage(&gpu_used, &gpu_available);
+  double total_gpu_memory = (gpu_used + gpu_available) / 1024. / 1024.;
+  float fraction_of_gpu_memory =
+      static_cast<double>(memory_pool_init_size_mb()) / total_gpu_memory;
+  return fraction_of_gpu_memory;
+#else
+  return 0.;
+#endif
 }

 void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
                                             size_t prog_buffer_size,
                                             const char *param_buffer,
                                             size_t param_buffer_size) {
-  prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size);
-  param_file = std::string(param_buffer, param_buffer + param_buffer_size);
+  prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
+  params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
  model_from_memory_ = true;
 }


--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -33,6 +33,7 @@
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"

 DECLARE_bool(profile);
@@ -59,8 +60,8 @@ bool AnalysisPredictor::Init(
  if (FLAGS_profile) {
    LOG(WARNING) << "Profiler is actived, might affect the performance";
    LOG(INFO) << "You can turn off by set gflags '-profile false'";
-    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
-                                           : platform::ProfilerState::kCPU;
+    auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll
+                                             : platform::ProfilerState::kCPU;
    platform::EnableProfiler(tracking_device);
  }

@@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram(
    // Optimize the program, and load parameters and modify them in the
    // scope_.
    // This will change the scope_ address.
-    if (config_.enable_ir_optim) {
+    if (config_.ir_optim()) {
      status_ir_optim_enabled_ = true;
      OptimizeInferenceProgram();
    } else {
@@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram(
  return true;
 }
 bool AnalysisPredictor::CreateExecutor() {
-  if (config_.use_gpu) {
+  if (config_.use_gpu_) {
    status_use_gpu_ = true;
-    place_ = paddle::platform::CUDAPlace(config_.device);
+    place_ = paddle::platform::CUDAPlace(config_.device_id_);
  } else {
    place_ = paddle::platform::CPUPlace();
  }
@@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() {
 }
 bool AnalysisPredictor::PrepareExecutor() {
  executor_->Prepare(sub_scope_, *inference_program_, 0,
-                     config_.use_feed_fetch_ops);
+                     config_.use_feed_fetch_ops_);

  PADDLE_ENFORCE_NOT_NULL(sub_scope_);

@@ -250,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
    }
    input.set_lod(lod);
    int idx = -1;
-    if (config_.specify_input_name) {
+    if (config_.specify_input_name_) {
      auto name = inputs[i].name;
      if (feed_names_.find(name) == feed_names_.end()) {
        LOG(ERROR) << "feed names from program do not have name: [" << name
@@ -314,22 +315,22 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
 void AnalysisPredictor::OptimizeInferenceProgram() {
  status_program_optimized_ = true;

-  argument_.SetUseGPU(config_.use_gpu);
-  argument_.SetGPUDeviceId(config_.device);
+  argument_.SetUseGPU(config_.use_gpu());
+  argument_.SetGPUDeviceId(config_.gpu_device_id());
  argument_.SetModelFromMemory(config_.model_from_memory_);
  // Analyze inference_program
-  if (!config_.model_dir.empty()) {
-    argument_.SetModelDir(config_.model_dir);
+  if (!config_.model_dir().empty()) {
+    argument_.SetModelDir(config_.model_dir());
  } else {
    PADDLE_ENFORCE(
-        !config_.param_file.empty(),
+        !config_.params_file().empty(),
        "Either model_dir or (param_file, prog_file) should be set.");
-    PADDLE_ENFORCE(!config_.prog_file.empty());
-    argument_.SetModelProgramPath(config_.prog_file);
-    argument_.SetModelParamsPath(config_.param_file);
+    PADDLE_ENFORCE(!config_.prog_file().empty());
+    argument_.SetModelProgramPath(config_.prog_file());
+    argument_.SetModelParamsPath(config_.params_file());
  }

-  if (config_.use_gpu && config_.use_tensorrt_) {
+  if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
    argument_.SetUseTensorRT(true);
    argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
    argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
@@ -341,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }

  auto passes = config_.pass_builder()->AllPasses();
-  if (!config_.enable_ir_optim) passes.clear();
+  if (!config_.ir_optim()) passes.clear();
  argument_.SetIrAnalysisPasses(passes);
  argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
  Analyzer().Run(&argument_);
@@ -358,18 +359,26 @@ template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
    AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
  VLOG(3) << "create AnalysisConfig";
-  if (config.use_gpu) {
+  if (config.use_gpu()) {
    // 1. GPU memeroy
-    PADDLE_ENFORCE_GT(
-        config.fraction_of_gpu_memory, 0.f,
-        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
-    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
+    PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
+    PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
+                      config.gpu_device_id());
    std::vector<std::string> flags;
-    if (config.fraction_of_gpu_memory >= 0.0f ||
-        config.fraction_of_gpu_memory <= 0.95f) {
+
+    float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
+    if (fraction_of_gpu_memory > 0.95f) {
+      LOG(ERROR)
+          << "Allocate too much memory for the GPU memory pool, assigned "
+          << config.memory_pool_init_size_mb() << " MB";
+      LOG(ERROR)
+          << "Try to shink the value by setting AnalysisConfig::EnableGpu(...)";
+    }
+
+    if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
      flags.push_back("dummpy");
      std::string flag = "--fraction_of_gpu_memory_to_use=" +
-                         std::to_string(config.fraction_of_gpu_memory);
+                         std::to_string(fraction_of_gpu_memory);
      flags.push_back(flag);
      VLOG(3) << "set flag: " << flag;
      framework::InitGflags(flags);
@@ -443,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() {
 bool AnalysisPredictor::LoadProgramDesc() {
  // Initialize the inference program
  std::string filename;
-  if (!config_.model_dir.empty()) {
-    filename = config_.model_dir + "/__model__";
-  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
+  if (!config_.model_dir().empty()) {
+    filename = config_.model_dir() + "/__model__";
+  } else if (!config_.prog_file().empty() && !config_.params_file().empty()) {
    // All parameters are saved in a single file.
    // The file names should be consistent with that used
    // in Python API `fluid.io.save_inference_model`.
-    filename = config_.prog_file;
+    filename = config_.prog_file();
  } else {
-    if (config_.model_dir.empty() && config_.prog_file.empty()) {
+    if (config_.model_dir().empty() && config_.prog_file().empty()) {
      LOG(ERROR)
          << "Either model_dir or (prog_file, param_file) should be set.";
      return false;
    }
    LOG(ERROR) << string::Sprintf(
-        "not valid model path '%s' or program path '%s'.", config_.model_dir,
-        config_.param_file);
+        "not valid model path '%s' or program path '%s'.", config_.model_dir(),
+        config_.params_file());
    return false;
  }

@@ -478,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() {

    proto.ParseFromString(pb_content);
  } else {
-    proto.ParseFromString(config_.prog_file);
+    proto.ParseFromString(config_.prog_file());
  }
  inference_program_.reset(new framework::ProgramDesc(proto));
  return true;
@@ -508,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() {
      new_var->SetLoDLevel(var->GetLoDLevel());
      new_var->SetPersistable(true);

-      if (!config_.param_file.empty()) {
+      if (!config_.params_file().empty()) {
        params.push_back(new_var->Name());
      } else {
        // append_op
        framework::OpDesc *op = load_block->AppendOp();
        op->SetType("load");
        op->SetOutput("Out", {new_var->Name()});
-        op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()});
+        op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()});
        op->CheckAttrs();
      }
    }
  }

-  if (!config_.param_file.empty()) {
+  if (!config_.params_file().empty()) {
    // sort paramlist to have consistent ordering
    std::sort(params.begin(), params.end());
    // append just the load_combine op
    framework::OpDesc *op = load_block->AppendOp();
    op->SetType("load_combine");
    op->SetOutput("Out", params);
-    op->SetAttr("file_path", {config_.param_file});
+    op->SetAttr("file_path", {config_.params_file()});
    op->CheckAttrs();
  }


--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -35,8 +35,11 @@ using framework::proto::ProgramDesc;
 using framework::NaiveExecutor;
 using contrib::AnalysisConfig;

-/* This predictor is based on the original native predictor with IR and Analysis
- * support. It will optimize IR and Parameters in the runtime.
+/** \brief This predictor is based on the original native predictor with IR and
+ * Analysis support.
+ *
+ * It will optimize IR and Parameters in the runtime.
+ *
 * TODO(Superjomn) Replace the Navive predictor?
 */
 class AnalysisPredictor : public PaddlePredictor {

--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -25,9 +25,9 @@ namespace paddle {
 using contrib::AnalysisConfig;

 TEST(AnalysisPredictor, analysis_off) {
-  AnalysisConfig config(false);
-  config.model_dir = FLAGS_dirname;
-  config.enable_ir_optim = false;
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname);
+  config.SwitchIrOptim(false);

  auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
  auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
@@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) {
 }

 TEST(AnalysisPredictor, analysis_on) {
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname);
+  config.SwitchIrOptim(true);
 #ifdef PADDLE_WITH_CUDA
-  AnalysisConfig config(true);
-  config.fraction_of_gpu_memory = 0.15;
+  config.EnableUseGpu(100, 0);
 #else
-  AnalysisConfig config;
+  config.DisableGpu();
 #endif
-  config.model_dir = FLAGS_dirname;
-  config.enable_ir_optim = true;

  auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
  auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
@@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) {
  }

  // compare with NativePredictor
-  auto naive_predictor = CreatePaddlePredictor<NativeConfig>(config);
+  auto naive_predictor =
+      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
  std::vector<PaddleTensor> naive_outputs;
  ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs));
  ASSERT_EQ(naive_outputs.size(), 1UL);
@@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) {

 TEST(AnalysisPredictor, ZeroCopy) {
  AnalysisConfig config;
-  config.model_dir = FLAGS_dirname;
-  config.use_feed_fetch_ops = false;
-
+  config.SetModel(FLAGS_dirname);
+  config.SwitchUseFeedFetchOps(false);
  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);

  auto w0 = predictor->GetInputTensor("firstw");
@@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) {

 TEST(AnalysisPredictor, Clone) {
  AnalysisConfig config;
-  config.model_dir = FLAGS_dirname;
-  config.use_feed_fetch_ops = true;
-  config.enable_ir_optim = true;
+  config.SetModel(FLAGS_dirname);
+  config.SwitchUseFeedFetchOps(true);
+  config.SwitchIrOptim(true);

  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
  predictors.emplace_back(CreatePaddlePredictor(config));

--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
@@ -19,8 +19,6 @@ limitations under the License. */

 #pragma once

-#define WITH_ANAKIN
-
 #include <vector>

 #include "framework/core/net/net.h"

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -288,7 +288,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
  VLOG(3) << "create NativePaddlePredictor";
  if (config.use_gpu) {
    // 1. GPU memeroy
-    PADDLE_ENFORCE_GT(
+    PADDLE_ENFORCE_GE(
        config.fraction_of_gpu_memory, 0.f,
        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);

--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
-
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"

--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) {
 #endif

 TEST(PassBuilder, Delete) {
-  contrib::AnalysisConfig config(false);
+  contrib::AnalysisConfig config;
+  config.DisableGpu();
  config.pass_builder()->DeletePass("attention_lstm_fuse_pass");
  const auto& passes = config.pass_builder()->AllPasses();
  auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass");

--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -116,6 +116,10 @@ D
      --modeldir=$DATA_DIR/mobilenet/model \
      --data=$DATA_DIR/mobilenet/data.txt \
      --refer=$DATA_DIR/mobilenet/result.txt 
+    if [ $? -ne 0 ]; then
+      echo "trt demo trt_mobilenet_demo runs fail."
+      exit 1
+    fi
  fi
 done
 set +x
--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
@@ -36,12 +36,11 @@ namespace demo {
 */
 void Main() {
  std::unique_ptr<PaddlePredictor> predictor;
-  paddle::contrib::AnalysisConfig config(true);
-  config.param_file = FLAGS_modeldir + "/__params__";
-  config.prog_file = FLAGS_modeldir + "/__model__";
-  config.device = 0;
+  paddle::contrib::AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(FLAGS_modeldir + "/__model__",
+                  FLAGS_modeldir + "/__params__");
  config.EnableTensorRtEngine();
-  config.fraction_of_gpu_memory = 0.1;  // set by yourself
  predictor = CreatePaddlePredictor(config);

  VLOG(3) << "begin to process data";

--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -40,15 +40,14 @@ using contrib::AnalysisConfig;
 */
 void Main(bool use_gpu) {
  std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
-  AnalysisConfig config(use_gpu);
-  config.param_file = FLAGS_modeldir + "/__params__";
-  config.prog_file = FLAGS_modeldir + "/__model__";
-  config.device = 0;
-  if (FLAGS_use_gpu) {
-    config.fraction_of_gpu_memory = 0.1;  // set by yourself
+  AnalysisConfig config;
+  if (use_gpu) {
+    config.EnableUseGpu(100, 0);
  }
+  config.SetModel(FLAGS_modeldir + "/__model__",
+                  FLAGS_modeldir + "/__params__");

-  predictor = CreatePaddlePredictor<NativeConfig>(config);
+  predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
  analysis_predictor = CreatePaddlePredictor(config);

  // Just a single batch of data.

--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -19,6 +19,8 @@
 #include <unordered_set>
 #include <vector>

+/*! \file */
+
 // Here we include some header files with relative paths, for that in deploy,
 // the abstract path of this header file will be changed.
 #include "paddle_api.h"           // NOLINT
@@ -34,41 +36,188 @@ class AnalysisPredictor;
 namespace contrib {

 // NOTE WIP, not stable yet.
-struct AnalysisConfig : public NativeConfig {
-  explicit AnalysisConfig(bool use_gpu = false);
+struct AnalysisConfig {
+  AnalysisConfig() = default;
  explicit AnalysisConfig(const AnalysisConfig& other);
-  explicit AnalysisConfig(AnalysisConfig&& other);
+  explicit AnalysisConfig(const std::string& model_dir);
+  explicit AnalysisConfig(const std::string& prog_file,
+                          const std::string& params_file);

-  // Determine whether to perform graph optimization.
-  bool enable_ir_optim = true;
+  /** Set model with a directory.
+   */
+  void SetModel(const std::string& model_dir) { model_dir_ = model_dir; }
+  /** Set model with two specific pathes for program and parameters.
+   */
+  void SetModel(const std::string& prog_file_path,
+                const std::string& params_file_path);
+  /** Set program file path.
+   */
+  void SetProgFile(const std::string& x) { prog_file_ = x; }
+  /** Set parameter composed file path.
+   */
+  void SetParamsFile(const std::string& x) { params_file_ = x; }
+  /** Get the model directory path.
+   */
+  const std::string& model_dir() const { return model_dir_; }
+  /** Get the program file path.
+   */
+  const std::string& prog_file() const { return prog_file_; }
+  /** Get the composed parameters file.
+   */
+  const std::string& params_file() const { return params_file_; }

-  // Get a pass builder for customize the passes in IR analysis phase.
-  PassStrategy* pass_builder() const;
+  // GPU related.
+
+  /**
+   * \brief Turn on GPU.
+   * @param memory_pool_init_size_mb initial size of the GPU memory pool in MB.
+   * @param device_id the GPU card to use (default is 0).
+   */
+  void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);
+  /** Turn off the GPU.
+   */
+  void DisableGpu();
+  /** A bool state telling whether the GPU is turned on.
+   */
+  bool use_gpu() const { return use_gpu_; }
+  /** Get the GPU device id.
+   */
+  int gpu_device_id() const { return device_id_; }
+  /** Get the initial size in MB of the GPU memory pool.
+   */
+  int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; }
+  /** Get the proportion of the initial memory pool size compared to the device.
+   */
+  float fraction_of_gpu_memory_for_pool() const;
+
+  /** \brief Control whether to perform IR graph optimization.
+   *
+   * If turned off, the AnalysisConfig will act just like a NativeConfig.
+   */
+  void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; }
+  /** A boolean state tell whether the ir graph optimization is actived.
+   */
+  bool ir_optim() const { return enable_ir_optim_; }

-  // NOT stable yet.
-  bool use_feed_fetch_ops{true};
+  /** \brief INTERNAL Determine whether to use the feed and fetch operators.
+   * Just for internal development, not stable yet.
+   * When ZeroCopyTensor is used, this should turned off.
+   */
+  void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; }
+  /** A boolean state telling whether to use the feed and fetch operators.
+   */
+  bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; }

+  /** \brief Control whether to specify the inputs' names.
+   *
+   * The PaddleTensor type has a `name` member, assign it with the corresponding
+   * variable name. This is used only when the input PaddleTensors passed to the
+   * `PaddlePredictor.Run(...)` cannot follow the order in the training phase.
+   */
+  void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; }
+
+  /** A boolean state tell whether the input PaddleTensor names specified should
+   * be used to reorder the inputs in `PaddlePredictor.Run(...)`.
+   */
+  bool specify_input_name() const { return specify_input_name_; }
+
+  /**
+   * \brief Turn on the TensorRT engine.
+   *
+   * The TensorRT engine will accelerate some subgraphes in the original Fluid
+   * computation graph. In some models such as TensorRT50, GoogleNet and so on,
+   * it gains significant performance acceleration.
+   *
+   * @param workspace_size the memory size(in byte) used for TensorRT workspace.
+   * @param max_batch_size the maximum batch size of this prediction task,
+   * better set as small as possible, or performance loss.
+   * @param min_subgrpah_size the minimum TensorRT subgraph size needed, if a
+   * subgraph is less than this, it will not transfer to TensorRT engine.
+   */
  void EnableTensorRtEngine(int workspace_size = 1 << 20,
                            int max_batch_size = 1, int min_subgraph_size = 3);
-  bool use_tensorrt() const { return use_tensorrt_; }
+  /** A boolean state telling whether the TensorRT engine is used.
+   */
+  bool tensorrt_engine_enabled() const { return use_tensorrt_; }

+  /** Control whther to debug IR graph analysis phase.
+   */
+  void SwitchIrDebug(int x = true) { ir_debug_ = x; }
+
+  /** Turn on MKLDNN.
+   */
  void EnableMKLDNN();
-  bool use_mkldnn() const { return use_mkldnn_; }
+  /** A boolean state telling whether to use the MKLDNN.
+   */
+  bool mkldnn_enabled() const { return use_mkldnn_; }
+
+  /** Set and get the number of cpu math library threads.
+   */
+  void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);
+  /** An int state telling how many threads are used in the CPU math library.
+   */
+  int cpu_math_library_num_threads() const {
+    return cpu_math_library_num_threads_;
+  }
+
+  /** Transform the AnalysisConfig to NativeConfig.
+   */
+  NativeConfig ToNativeConfig() const {
+    NativeConfig config;
+    config.model_dir = model_dir_;
+    config.prog_file = prog_file_;
+    config.param_file = params_file_;
+    config.use_gpu = use_gpu_;
+    config.device = device_id_;
+    config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
+    config.specify_input_name = specify_input_name_;
+    return config;
+  }
+  /** Specify the operator type list to use MKLDNN acceleration.
+   * @param op_list the operator type list.
+   */
  void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
    mkldnn_enabled_op_types_ = op_list;
  }

-  // Specify the memory buffer of program and parameter
+  /** Specify the memory buffer of program and parameter
+   * @param prog_buffer the memory buffer of program.
+   * @param prog_buffer_size the size of the data.
+   * @param params_buffer the memory buffer of the composed parameters file.
+   * @param params_buffer_size the size of the commposed parameters data.
+   */
  void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
-                      const char* program_buffer, size_t program_buffer_size);
+                      const char* params_buffer, size_t params_buffer_size);
+  /** A boolean state telling whether the model is set from the CPU memory.
+   */
  bool model_from_memory() const { return model_from_memory_; }

  friend class ::paddle::AnalysisPredictor;

+  /** NOTE just for developer, not an official API, easily to be broken.
+   * Get a pass builder for customize the passes in IR analysis phase.
+   */
+  PassStrategy* pass_builder() const;
+
 protected:
+  // Update the config.
+  void Update();
+
+  std::string SerializeInfoCache();
+
+ protected:
+  // Model pathes.
+  std::string model_dir_;
+  std::string prog_file_;
+  std::string params_file_;
+
+  // GPU releated.
+  bool use_gpu_{false};
+  int device_id_{0};
+  uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
+
+  // TensorRT releated.
  bool use_tensorrt_{false};
-  bool use_mkldnn_{false};
-  std::unordered_set<std::string> mkldnn_enabled_op_types_;
  // For workspace_size, refer it from here:
  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
  int tensorrt_workspace_size_;
@@ -82,17 +231,24 @@ struct AnalysisConfig : public NativeConfig {
  //  We set this variable to control the minimum number of nodes in the
  //  subgraph, 3 as default value.
  int tensorrt_min_subgraph_size_{3};
-  std::unique_ptr<PassStrategy> pass_builder_;
+
+  bool use_mkldnn_{false};
+  std::unordered_set<std::string> mkldnn_enabled_op_types_;
+
  bool model_from_memory_{false};
-};

-// Configurations for Anakin engine.
-struct AnakinConfig : public PaddlePredictor::Config {
-  enum TargetType { NVGPU = 0, X86 };
-  int device;
-  std::string model_file;
-  int max_batch_size{-1};
-  TargetType target_type;
+  bool enable_ir_optim_{true};
+  bool use_feed_fetch_ops_{true};
+  bool ir_debug_{false};
+
+  bool specify_input_name_{false};
+
+  int cpu_math_library_num_threads_{1};
+
+  // A runtime cache, shouldn't be transferred to others.
+  std::string serialized_info_cache_;
+
+  mutable std::unique_ptr<PassStrategy> pass_builder_;
 };

 }  // namespace contrib

--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -13,61 +13,76 @@
 // limitations under the License.
 #pragma once

+/*! \file paddle_api.h
+ */
+
 #include <cassert>
 #include <memory>
 #include <string>
 #include <vector>

+/*! \namespace paddle
+ */
 namespace paddle {

-// Data type.
+/** paddle data type.
+ */
 enum PaddleDType {
  FLOAT32,
  INT64,
  // TODO(Superjomn) support more data types if needed.
 };

-/*
- * Memory menage for PaddleTensor.
- * The PaddleBuf holds a buffer for data input or output. The memory can be
- * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
- * should be reused for better performance.
+/**
+ *\brief Memory menager for PaddleTensor.
 *
- * For user allocated memory, the following API can be used:
- * - PaddleBuf(void* data, size_t length) to set an external memory by
- * specifying
- *   the memory address and length.
- * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
- * memory.
- * ATTENTION, for user allocated memory, deallocation should be done by users
- * externally after the program finished. The PaddleBuf won't do any allocation
- * or deallocation.
+ *The PaddleBuf holds a buffer for data input or output. The memory can be
+ *allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
+ *should be reused for better performance.
 *
- * To have the PaddleBuf allocate and manage the memory:
- * - PaddleBuf(size_t length) will allocate a memory of size `length`.
- * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
- *   if the allocated memory is larger than `length`, nothing will done.
+ *For user allocated memory, the following API can be used:
+ *- PaddleBuf(void* data, size_t length) to set an external memory by
+ *specifying
+ *  the memory address and length.
+ *- Reset(void* data, size_t length) to reset the PaddleBuf with an external
+ *memory.
+ *ATTENTION, for user allocated memory, deallocation should be done by users
+ *externally after the program finished. The PaddleBuf won't do any allocation
+ *or deallocation.
+ *
+ *To have the PaddleBuf allocate and manage the memory:
+ *- PaddleBuf(size_t length) will allocate a memory of size `length`.
+ *- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
+ *  if the allocated memory is larger than `length`, nothing will done.
 */
 class PaddleBuf {
 public:
-  // PaddleBuf allocate memory internally, and manage it.
+  /** PaddleBuf allocate memory internally, and manage it.
+   */
  explicit PaddleBuf(size_t length)
      : data_(new char[length]), length_(length), memory_owned_(true) {}
-  // Set external memory, the PaddleBuf won't manage it.
+  /** Set external memory, the PaddleBuf won't manage it.
+   */
  PaddleBuf(void* data, size_t length)
      : data_(data), length_(length), memory_owned_{false} {}
-  // Copy only available when memory is managed externally.
+  /** Copy only available when memory is managed externally.
+   */
  explicit PaddleBuf(const PaddleBuf&);

-  // Resize the memory.
+  /** Resize the memory.
+   */
  void Resize(size_t length);
-  // Reset to external memory, with address and length set.
+  /** Reset to external memory, with address and length set.
+   */
  void Reset(void* data, size_t length);
-  // Tell whether the buffer is empty.
+  /** Tell whether the buffer is empty.
+   */
  bool empty() const { return length_ == 0; }
-  // Get the memory address.
+  /** Get the memory address.
+   */
  void* data() const { return data_; }
-  // Get the memory length.
+  /** Get the memory length.
+   */
  size_t length() const { return length_; }

  ~PaddleBuf() { Free(); }
@@ -83,7 +98,8 @@ class PaddleBuf {
  bool memory_owned_{true};
 };

-// Basic input and output data structure for PaddlePredictor.
+/** Basic input and output data structure for PaddlePredictor.
+ */
 struct PaddleTensor {
  PaddleTensor() = default;
  std::string name;  // variable name.
@@ -94,19 +110,22 @@ struct PaddleTensor {
 };

 enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
-// Tensor without copy, currently only supports AnalysisPredictor.
+/** Tensor without copy, currently only supports AnalysisPredictor.
+ */
 class ZeroCopyTensor {
 public:
  void Reshape(const std::vector<int>& shape);

-  // Get the memory in CPU or GPU with specific data type, should Reshape first
-  // to tell the data size.
-  // Once can directly call this data to feed the data.
-  // This is for write the input tensor.
+  /** Get the memory in CPU or GPU with specific data type, should Reshape first
+   * to tell the data size.
+   * Once can directly call this data to feed the data.
+   * This is for write the input tensor.
+   */
  template <typename T>
  T* mutable_data(PaddlePlace place);
-  // Get the memory directly, will return the place and memory size by pointer.
-  // This is for reading the output tensor.
+  /** Get the memory directly, will return the place and memory size by pointer.
+   * This is for reading the output tensor.
+   */
  template <typename T>
  T* data(PaddlePlace* place, int* size) const;

@@ -128,8 +147,7 @@ class ZeroCopyTensor {
  void* scope_{nullptr};
 };

-/*
- * A simple Inference API for Paddle.
+/** A simple Inference API for Paddle.
 */
 class PaddlePredictor {
 public:
@@ -138,18 +156,20 @@ class PaddlePredictor {
  PaddlePredictor(const PaddlePredictor&) = delete;
  PaddlePredictor& operator=(const PaddlePredictor&) = delete;

-  // Predict an record.
-  // The caller should be responsible for allocating and releasing the memory of
-  // `inputs`. `inputs` should be available until Run returns. Caller should be
-  // responsible for the output tensor's buffer, either allocated or passed from
-  // outside.
+  /** Predict an record.
+   * The caller should be responsible for allocating and releasing the memory of
+   * `inputs`. `inputs` should be available until Run returns. Caller should be
+   * responsible for the output tensor's buffer, either allocated or passed from
+   * outside.
+   */
  virtual bool Run(const std::vector<PaddleTensor>& inputs,
                   std::vector<PaddleTensor>* output_data,
                   int batch_size = -1) = 0;

-  // Zero copy input and output optimization.
-  // Get the input or output tensors, and operate on their memory directly,
-  // without copy.
+  /** Zero copy input and output optimization.
+   * Get the input or output tensors, and operate on their memory directly,
+   * without copy.
+   */
  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
      const std::string& name) {
    return nullptr;
@@ -160,16 +180,19 @@ class PaddlePredictor {
  }
  virtual bool ZeroCopyRun() { return false; }

-  // Clone a predictor that share the model weights, the Cloned predictor should
-  // be thread-safe.
+  /** Clone a predictor that share the model weights, the Cloned predictor
+   * should be thread-safe.
+   */
  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;

-  // Destroy the Predictor.
+  /** Destroy the Predictor.
+   */
  virtual ~PaddlePredictor() = default;

-  // The common configs for all the predictors.
+  /** The common configs for all the predictors.
+   */
  struct Config {
-    std::string model_dir;  // path to the model directory.
+    std::string model_dir; /*!< path to the model directory. */
  };
 };

@@ -177,17 +200,21 @@ struct NativeConfig : public PaddlePredictor::Config {
  // GPU related fields.
  bool use_gpu{false};
  int device{0};
-  float fraction_of_gpu_memory{-1.f};  // Change to a float in (0,1] if needed.
+  float fraction_of_gpu_memory{
+      -1.f}; /*!< Change to a float in (0,1] if needed. */

  // Specify the exact path of program and parameter files.
  std::string prog_file;
  std::string param_file;

-  // Specify the variable's name of each input if input tensors don't follow the
-  // `feeds` and `fetches` of the phase `save_inference_model`.
+  /** Specify the variable's name of each input if input tensors don't follow
+   * the
+   * `feeds` and `fetches` of the phase `save_inference_model`.
+   */
  bool specify_input_name{false};

-  // Set and get the number of cpu math library threads.
+  /** Set and get the number of cpu math library threads.
+   */
  void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
    cpu_math_library_num_threads_ = cpu_math_library_num_threads;
  }
@@ -201,28 +228,33 @@ struct NativeConfig : public PaddlePredictor::Config {
  int cpu_math_library_num_threads_{1};
 };

-// A factory to help create different predictors.
-//
-// Usage:
-//
-// NativeConfig config;
-// ... // change the configs.
-// auto native_predictor = CreatePaddlePredictor(config);
-//
-// FOR EXTENSION DEVELOPER:
-// Different predictors are designated by config type. Similar configs can be
-// merged, but there shouldn't be a huge config containing different fields for
-// more than one kind of predictors.
+/*! \fn std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&
+ * config);
+ *
+ * \brief A factory to help create different predictors.
+ *
+ * Usage:
+ *
+ * NativeConfig config;
+ * ... // change the configs.
+ * auto native_predictor = CreatePaddlePredictor(config);
+ *
+ * FOR EXTENSION DEVELOPER:
+ * Different predictors are designated by config type. Similar configs can be
+ * merged, but there shouldn't be a huge config containing different fields for
+ * more than one kind of predictors.
+ */
 template <typename ConfigT>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);

-// NOTE The following APIs are too trivial, we will discard it in the following
-// versions.
+/** NOTE The following APIs are too trivial, we will discard it in the following
+ * versions.
+ */
 enum class PaddleEngineKind {
-  kNative = 0,         // Use the native Fluid facility.
-  kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
-  kAnalysis,           // More optimization.
-  kAnakin              // Use Anakin for inference, not mature yet.
+  kNative = 0,        /*!< Use the native Fluid facility. */
+  kAutoMixedTensorRT, /*!< Automatically mix Fluid with TensorRT. */
+  kAnalysis,          /*!< More optimization. */
+  kAnakin             /*!< Use Anakin for inference, not mature yet. */
 };

 template <typename ConfigT, PaddleEngineKind engine>

--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -26,9 +26,8 @@ limitations under the License. */
 #include <string>
 #include <vector>

-#include "paddle_api.h"  // NOLINT
-#ifndef WITH_ANAKIN
 #include "paddle_analysis_config.h"  // NOLINT
-#else
+#include "paddle_api.h"              // NOLINT
+#ifdef WITH_ANAKIN
 #include "paddle_anakin_config.h"  // NOLINT
 #endif
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -18,30 +18,39 @@
 #include <string>
 #include <vector>

+/*! \file */
+
+/*! \namespace paddle */
 namespace paddle {
-/*
- * This is a pass builder based on string. It is part of inference API.
+
+/** This is a pass builder based on string. It is part of inference API.
 */
 class PaddlePassBuilder {
 public:
  explicit PaddlePassBuilder(const std::vector<std::string> &passes)
      : passes_(passes) {}

+  /** Append a pass to the end of the passes. */
  void AppendPass(const std::string &pass_type);

+  /** Insert a pass to a specific position.
+   * @param idx the position to insert.
+   * @param pass_type the pass key.
+   */
  void InsertPass(size_t idx, const std::string &pass_type);

-  // Delete the `idx`-th pass.
+  /** Delete the `idx`-th pass. */
  void DeletePass(size_t idx);

-  // Delete all the passes that has type `pass_type`.
+  /** Delete all the passes that has type `pass_type`. */
  void DeletePass(const std::string &pass_type);

-  // Visualize the computation graph after each pass by generating a DOT
-  // language file, one can draw them with the Graphviz toolkit.
+  /** Visualize the computation graph after each pass by generating a DOT
+   * language file, one can draw them with the Graphviz toolkit.
+   */
  void TurnOnDebug();

-  // Human-readible information.
+  /** Human-readible information. */
  std::string DebugString();

  const std::vector<std::string> &AllPasses() const { return passes_; }
@@ -50,23 +59,27 @@ class PaddlePassBuilder {
  std::vector<std::string> passes_;
 };

-/*
- * Pass strategy to help control the IR passes.
+/**Pass strategy to help control the IR passes.
 */
 class PassStrategy : public PaddlePassBuilder {
 public:
  explicit PassStrategy(const std::vector<std::string> &passes)
      : PaddlePassBuilder(passes) {}

-  // The MKLDNN control exists in both CPU and GPU mode, because there can be
-  // still some CPU kernels running in CPU mode.
+  /** The MKLDNN control exists in both CPU and GPU mode, because there can be
+   * still some CPU kernels running in CPU mode.
+   */
  virtual void EnableMKLDNN() = 0;

+  bool use_gpu() const { return use_gpu_; }
+
  virtual ~PassStrategy() = default;
+
+ protected:
+  bool use_gpu_{false};
 };

-/*
- * The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
+/** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
 */
 class CpuPassStrategy : public PassStrategy {
 public:
@@ -76,6 +89,7 @@ class CpuPassStrategy : public PassStrategy {
    passes_.assign({
        "infer_clean_graph_pass",         //
        "attention_lstm_fuse_pass",       //
+        "seqpool_concat_fuse_pass",       //
        "seqconv_eltadd_relu_fuse_pass",  //
        // "embedding_fc_lstm_fuse_pass", //
        "fc_lstm_fuse_pass",             //
@@ -88,6 +102,7 @@ class CpuPassStrategy : public PassStrategy {
        "conv_eltwiseadd_bn_fuse_pass",  //
        "is_test_pass",                  //
    });
+    use_gpu_ = false;
  }

  virtual ~CpuPassStrategy() = default;
@@ -111,8 +126,7 @@ class CpuPassStrategy : public PassStrategy {
  CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
 };

-/*
- * The GPU passes strategy, it is used in
+/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
 */
 class GpuPassStrategy : public PassStrategy {
 public:
@@ -126,10 +140,14 @@ class GpuPassStrategy : public PassStrategy {
        "conv_elementwise_add2_act_fuse_pass",       //
        "conv_elementwise_add_fuse_pass",            //
    });
+
+    use_gpu_ = true;
  }

  GpuPassStrategy(const GpuPassStrategy &other)
-      : PassStrategy(other.AllPasses()) {}
+      : PassStrategy(other.AllPasses()) {
+    use_gpu_ = true;
+  }

  void EnableMKLDNN() override;


--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
 nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
+nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
 add_subdirectory(plugin)

--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tensorrt/op_teller.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Just tell by the op_types.
+struct SimpleOpTypeSetTeller : public Teller {
+  SimpleOpTypeSetTeller() {}
+
+  bool operator()(const std::string& op_type,
+                  const framework::OpDesc& desc) override {
+    return teller_set.count(op_type);
+  }
+
+ private:
+  std::unordered_set<std::string> teller_set{
+      {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
+       "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
+       "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
+       "conv2d_transpose", "leaky_relu"}};
+};
+
+bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
+  for (auto& teller : tellers_) {
+    if ((*teller)(op_type, desc)) return true;
+  }
+  return false;
+}
+
+OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/op_teller.h
+++ b/paddle/fluid/inference/tensorrt/op_teller.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Single Op teller definition.
+ * One can override this and define a more complex tell logic, considerring more
+ * issues such as op_desc.
+ */
+struct Teller {
+  virtual bool operator()(const std::string& op_type,
+                          const framework::OpDesc& desc) = 0;
+
+  virtual ~Teller() = default;
+};
+/*
+ * A real example:
+ *
+ * struct SomeTeller : public Teller {
+ * bool operator()(const std::string& op_type,
+ *                const framework::OpDesc& desc) override {
+ *  return op_type == "fc" && desc.Inputs().size() == 2;
+ * }
+ *};
+ */
+
+/*
+ * class OpTeller helps to tell whether a fluid
+ * operator can be transformed to a TensorRT layer.
+ */
+class OpTeller {
+ public:
+  static OpTeller& Global() {
+    static std::unique_ptr<OpTeller> x(new OpTeller);
+    return *x;
+  }
+
+  bool Tell(const std::string& op_type, const framework::OpDesc& desc);
+
+ private:
+  OpTeller();
+
+ private:
+  std::vector<std::unique_ptr<Teller>> tellers_;
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -41,7 +41,7 @@ endfunction()
 if(NOT APPLE AND WITH_MKLML)
    set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
    download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
-    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
+    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc SERIAL)
 else()
    # TODO: fix this test on MACOS and OPENBLAS, the reason is that
    # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
@@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
 # normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
+inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL)

 # small DAM
 set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
 download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
 inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1)
+        ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL)

 # chinese_ner
 set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
@@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose

 # resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
-  "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
+  "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL)

 # mobilenet with depthwise_conv op
 inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
-  "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
+  "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)

 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI

--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -165,12 +165,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 }

 void SetConfig(contrib::AnalysisConfig *cfg) {
-  cfg->prog_file = FLAGS_infer_model + "/__model__";
-  cfg->param_file = FLAGS_infer_model + "/param";
-  cfg->use_gpu = false;
-  cfg->device = 0;
-  cfg->specify_input_name = true;
-  cfg->enable_ir_optim = true;
+  cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
+  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchIrOptim(true);
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -105,11 +105,10 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 }

 void SetConfig(AnalysisConfig *cfg) {
-  cfg->model_dir = FLAGS_infer_model;
-  cfg->use_gpu = false;
-  cfg->device = 0;
-  cfg->specify_input_name = true;
-  cfg->enable_ir_optim = true;
+  cfg->SetModel(FLAGS_infer_model);
+  cfg->DisableGpu();
+  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchIrOptim();
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

--- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
@@ -76,11 +76,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 }

 void SetConfig(contrib::AnalysisConfig *cfg) {
-  cfg->model_dir = FLAGS_infer_model;
-  cfg->use_gpu = false;
-  cfg->device = 0;
-  cfg->specify_input_name = true;
-  cfg->enable_ir_optim = true;
+  cfg->SetModel(FLAGS_infer_model);
+  cfg->DisableGpu();
+  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchIrOptim();
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -84,13 +84,12 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) {
    cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
                        buffer_param.size());
  } else {
-    cfg->prog_file = FLAGS_infer_model + "/__model__";
-    cfg->param_file = FLAGS_infer_model + "/param";
+    cfg->SetModel(FLAGS_infer_model + "/__model__",
+                  FLAGS_infer_model + "/param");
  }
-  cfg->use_gpu = false;
-  cfg->device = 0;
-  cfg->specify_input_name = true;
-  cfg->enable_ir_optim = true;
+  cfg->DisableGpu();
+  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchIrOptim();
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -21,12 +21,10 @@ namespace inference {
 namespace analysis {

 void SetConfig(AnalysisConfig *cfg) {
-  cfg->param_file = FLAGS_infer_model + "/params";
-  cfg->prog_file = FLAGS_infer_model + "/model";
-  cfg->use_gpu = false;
-  cfg->device = 0;
-  cfg->enable_ir_optim = true;
-  cfg->specify_input_name = true;
+  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
+  cfg->DisableGpu();
+  cfg->SwitchIrOptim();
+  cfg->SwitchSpecifyInputNames();
  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
 }


--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
 }

 void SetConfig(AnalysisConfig *cfg) {
-  cfg->prog_file = FLAGS_infer_model + "/__model__";
-  cfg->param_file = FLAGS_infer_model + "/param";
-  cfg->use_gpu = false;
-  cfg->device = 0;
-  cfg->specify_input_name = true;
-  cfg->enable_ir_optim = true;
+  cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
+  cfg->DisableGpu();
+  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchIrOptim();
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
@@ -225,10 +223,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

 // Easy for profiling independently.
 TEST(Analyzer_rnn1, profile) {
-  contrib::AnalysisConfig cfg(false);
+  contrib::AnalysisConfig cfg;
  SetConfig(&cfg);
-  cfg.fraction_of_gpu_memory = 0.1;
-  cfg.pass_builder()->TurnOnDebug();
+  cfg.DisableGpu();
+  cfg.SwitchIrDebug();
  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -285,7 +283,7 @@ TEST(Analyzer_rnn1, multi_thread) {
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, 4 /* multi_thread */);
+                 input_slots_all, &outputs, 2 /* multi_thread */);
 }

 // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
@@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) {
 TEST(Analyzer_rnn1, ZeroCopy) {
  AnalysisConfig config;
  SetConfig(&config);
-  config.use_feed_fetch_ops = false;
+  config.SwitchUseFeedFetchOps(false);

  PaddlePlace place;

  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);

-  config.use_feed_fetch_ops = true;
-  auto native_predictor = CreatePaddlePredictor<NativeConfig>(config);
+  config.SwitchUseFeedFetchOps(true);
+  auto native_predictor =
+      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());

-  config.use_feed_fetch_ops = true;  // the analysis predictor needs feed/fetch.
+  config.SwitchUseFeedFetchOps(
+      true);  // the analysis predictor needs feed/fetch.
  auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);

 #define NEW_TENSOR(name__) \
@@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) {
 TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
  AnalysisConfig config;
  SetConfig(&config);
-  config.use_feed_fetch_ops = false;
+  config.SwitchUseFeedFetchOps(false);

 #define NEW_TENSOR(name__) \
  auto name__##_tensor = predictor->GetInputTensor(#name__);

--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -105,12 +105,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 }

 void SetConfig(AnalysisConfig *cfg) {
-  cfg->prog_file = FLAGS_infer_model + "/__model__";
-  cfg->param_file = FLAGS_infer_model + "/param";
-  cfg->use_gpu = false;
-  cfg->device = 0;
-  cfg->specify_input_name = true;
-  cfg->enable_ir_optim = true;
+  cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
+  cfg->DisableGpu();
+  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchIrOptim();
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -89,11 +89,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 }

 void SetConfig(AnalysisConfig *cfg) {
-  cfg->model_dir = FLAGS_infer_model;
-  cfg->use_gpu = false;
-  cfg->device = 0;
-  cfg->specify_input_name = true;
-  cfg->enable_ir_optim = true;
+  cfg->SetModel(FLAGS_infer_model);
+  cfg->DisableGpu();
+  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchIrOptim();
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -122,12 +122,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
 }

 void SetConfig(AnalysisConfig *cfg) {
-  cfg->param_file = FLAGS_infer_model + "/params";
-  cfg->prog_file = FLAGS_infer_model + "/model";
-  cfg->use_gpu = false;
-  cfg->device = 0;
-  cfg->enable_ir_optim = true;
-  cfg->specify_input_name = true;
+  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
+  cfg->DisableGpu();
+  cfg->SwitchSpecifyInputNames();
  cfg->pass_builder()->TurnOnDebug();
  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
 }
@@ -180,8 +177,12 @@ TEST(Analyzer_seq_pool1, fuse_statis) {
  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
  auto fuse_statis = GetFuseStatis(
      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
+
+  ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
+  EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
+
  LOG(INFO) << "num_ops: " << num_ops;
-  EXPECT_EQ(num_ops, 349);
+  EXPECT_EQ(num_ops, 195);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -47,11 +47,10 @@ struct DataReader {
 };

 void SetConfig(AnalysisConfig *cfg) {
-  cfg->model_dir = FLAGS_infer_model;
-  cfg->use_gpu = false;
-  cfg->device = 0;
-  cfg->specify_input_name = true;
-  cfg->enable_ir_optim = true;
+  cfg->SetModel(FLAGS_infer_model);
+  cfg->DisableGpu();
+  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchIrOptim();
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) {
 }

 void SetConfig(AnalysisConfig *cfg) {
-  cfg->param_file = FLAGS_infer_model + "/__params__";
-  cfg->prog_file = FLAGS_infer_model + "/__model__";
-  cfg->use_gpu = false;
-  cfg->device = 0;
-  cfg->enable_ir_optim = true;
-  cfg->specify_input_name = true;
+  cfg->SetModel(FLAGS_infer_model + "/__model__",
+                FLAGS_infer_model + "/__params__");
+  cfg->DisableGpu();
+  cfg->SwitchIrDebug();
+  cfg->SwitchSpecifyInputNames();
  // TODO(TJ): fix fusion gru
  cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
 }

--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -62,21 +62,25 @@ std::ostream &operator<<(std::ostream &os,
                         const contrib::AnalysisConfig &config) {
  os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
  num_spaces++;
-  os << *reinterpret_cast<const NativeConfig *>(&config);
+  os << config.ToNativeConfig();
  if (!config.model_from_memory()) {
-    os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
-    os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
+    os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n";
+    os << GenSpaces(num_spaces) << "param_file: " << config.params_file()
+       << "\n";
  } else {
    os << GenSpaces(num_spaces)
       << "prog_file and param_file: load from memory \n";
  }
-  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
+  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
     << "\n";
+  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
+     << "\n";
+  os << GenSpaces(num_spaces)
+     << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n";
  os << GenSpaces(num_spaces)
-     << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n";
-  os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt()
+     << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n";
+  os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled()
     << "\n";
-  os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n";
  num_spaces--;
  os << GenSpaces(num_spaces) << "}\n";
  return os;

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -54,11 +54,13 @@ namespace paddle {
 namespace inference {

 void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
+  const auto *analysis_config =
+      reinterpret_cast<const contrib::AnalysisConfig *>(config);
  if (use_analysis) {
-    LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
+    LOG(INFO) << *analysis_config;
    return;
  }
-  LOG(INFO) << *reinterpret_cast<const NativeConfig *>(config);
+  LOG(INFO) << analysis_config->ToNativeConfig();
 }

 void CompareResult(const std::vector<PaddleTensor> &outputs,
@@ -96,12 +98,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,

 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
    const PaddlePredictor::Config *config, bool use_analysis = true) {
+  const auto *analysis_config =
+      reinterpret_cast<const contrib::AnalysisConfig *>(config);
  if (use_analysis) {
-    return CreatePaddlePredictor<contrib::AnalysisConfig>(
-        *(reinterpret_cast<const contrib::AnalysisConfig *>(config)));
+    return CreatePaddlePredictor<contrib::AnalysisConfig>(*analysis_config);
  }
-  return CreatePaddlePredictor<NativeConfig>(
-      *(reinterpret_cast<const NativeConfig *>(config)));
+  auto native_config = analysis_config->ToNativeConfig();
+  return CreatePaddlePredictor<NativeConfig>(native_config);
 }

 size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }

--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -46,22 +46,20 @@ void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config,
                                        std::string model_dir, bool use_gpu,
                                        bool use_tensorrt, int batch_size) {
  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
-    config->prog_file = model_dir + "/" + FLAGS_prog_filename;
-    config->param_file = model_dir + "/" + FLAGS_param_filename;
+    config->SetModel(model_dir + "/" + FLAGS_prog_filename,
+                     model_dir + "/" + FLAGS_param_filename);
  } else {
-    config->model_dir = model_dir;
+    config->SetModel(model_dir);
  }
  if (use_gpu) {
-    config->use_gpu = true;
-    config->device = 0;
-    config->fraction_of_gpu_memory = 0.15;
+    config->EnableUseGpu(100, 0);
    if (use_tensorrt) {
      config->EnableTensorRtEngine(1 << 10, batch_size);
      config->pass_builder()->DeletePass("conv_bn_fuse_pass");
      config->pass_builder()->DeletePass("fc_fuse_pass");
      config->pass_builder()->TurnOnDebug();
    } else {
-      config->enable_ir_optim = true;
+      config->SwitchIrOptim();
    }
  }
 }
@@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {

  std::vector<PaddleTensor> outputs;
  if (use_analysis || use_tensorrt) {
-    contrib::AnalysisConfig config(true);
+    contrib::AnalysisConfig config;
+    config.EnableUseGpu(100, 0);
    config.pass_builder()->TurnOnDebug();
    SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt,
                                       FLAGS_batch_size);
@@ -100,23 +99,12 @@ void compare(std::string model_dir, bool use_tensorrt) {
    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
  }

-  std::vector<PaddleTensor> native_outputs;
-  NativeConfig native_config;
-  SetConfig<NativeConfig>(&native_config, model_dir, true, false,
-                          FLAGS_batch_size);
-  TestOneThreadPrediction(
-      reinterpret_cast<PaddlePredictor::Config*>(&native_config), inputs_all,
-      &native_outputs, false);
-
-  std::vector<PaddleTensor> analysis_outputs;
-  contrib::AnalysisConfig analysis_config(true);
+  contrib::AnalysisConfig analysis_config;
  SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
                                     use_tensorrt, FLAGS_batch_size);
-  TestOneThreadPrediction(
-      reinterpret_cast<PaddlePredictor::Config*>(&analysis_config), inputs_all,
-      &analysis_outputs, true);
-
-  CompareResult(native_outputs, analysis_outputs);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config),
+      inputs_all);
 }

 TEST(TensorRT_mobilenet, compare) {
@@ -154,9 +142,9 @@ TEST(TensorRT_mobilenet, analysis) {

 TEST(AnalysisPredictor, use_gpu) {
  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
-  AnalysisConfig config(true);
-  config.model_dir = model_dir;
-  config.fraction_of_gpu_memory = 0.15;
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(model_dir);
  config.pass_builder()->TurnOnDebug();

  std::vector<std::vector<PaddleTensor>> inputs_all;

--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -297,6 +297,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
        layout, framework::vectorize2int(filter->dims()), groups);

+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    // Enable Tensor Core for cudnn backward
+    if (dev_ctx.GetComputeCapability() >= 70 &&
+        std::type_index(typeid(T)) ==
+            std::type_index(typeid(platform::float16))) {
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+      VLOG(5) << "use cudnn_tensor_op_math for backward";
+    } else {
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+      VLOG(5) << "NOT use cudnn_tensor_op_math for backward";
+    }
+#endif
+
    int input_channels = input->dims()[1];
    int input_height, input_width, input_depth;
    if (input->dims().size() == 5) {

--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
 #endif

  auto input_data_type = ctx.Input<Tensor>("Input")->type();
-  auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
-  PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
-                    "input and filter data type should be consistent");
-
+  if (input_data_type != framework::proto::VarType::INT8 &&
+      input_data_type != framework::proto::VarType::UINT8) {
+    auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
+    PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
+                      "input and filter data type should be consistent");
+  }
  if (input_data_type == framework::proto::VarType::FP16) {
    PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
                      "float16 can only be used when CUDNN is used");
@@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() {
                "whenever convolution output is as an input to residual "
                "connection.")
      .SetDefault(false);
+  AddAttr<float>("Scale_in",
+                 "Scale_in to be used for int8 input data."
+                 "Only used with MKL-DNN INT8.")
+      .SetDefault(1.0f);
+  AddAttr<float>("Scale_out",
+                 "Scale_out to be used for int8 output data."
+                 "Only used with MKL-DNN INT8.")
+      .SetDefault(1.0f);
+  AddAttr<float>("Scale_in_eltwise",
+                 "Scale_in_eltwise to be used for int8 eltwise input data."
+                 "Only used with MKL-DNN INT8.")
+      .SetDefault(1.0f);
+  AddAttr<std::vector<float>>("Scale_weights",
+                              "Scale_weights to be used for int8 weights data."
+                              "Only used with MKL-DNN INT8.")
+      .SetDefault({1.0f});
+  AddAttr<bool>("force_fp32_output",
+                "(bool, default false) Force INT8 kernel output FP32, only "
+                "used in MKL-DNN INT8")
+      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "
@@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() {
      "Defaults to \"NHWC\". Specify the data format of the output data, "
      "the input will be transformed automatically. ")
      .SetDefault("AnyLayout");
+  AddAttr<bool>("force_fp32_output",
+                "(bool, default false) Only used in mkldnn INT8 kernel")
+      .SetDefault(false);
  // TODO(dzhwinter): need to registered layout transform function
  AddAttr<int>("workspace_size_MB",
               "Only used in cudnn kernel. workspace size for cudnn, in MB, "

--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -29,6 +29,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 constexpr int kConvMKLDNNFP32 = 1;
 constexpr int kConvMKLDNNINT8 = 2;
+constexpr int MaxKeyLength = 256;

 // Base convolution operator definations for other conv
 // like operators to reuse the implementation.

--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -32,7 +32,7 @@ namespace paddle {
 namespace operators {
 namespace distributed {

-using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
 using DDim = framework::DDim;
@@ -117,6 +117,12 @@ static void MergeMultipleVarsIntoOneBySection(
  auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
  auto* out_tensor =
      scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();
+
+  PADDLE_ENFORCE_GT(
+      out_tensor->numel(), 0,
+      "When calling this method, the LoDTensor's numel must larger than zero. "
+      "Please check LoDTensor::Resize has been called first.");
+
  auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place());

  bool is_on_cpu_place = true;
@@ -138,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection(

      auto row_numel = dims[1];

-      for (size_t i = 0; i < dims[0]; ++i) {
+      for (int64_t i = 0; i < dims[0]; ++i) {
        auto id = ids_in_this_section[i];
        auto origin_id = id + abs_sections[section_idx];
        auto& offsets = id_to_offset[origin_id];
@@ -172,8 +178,9 @@ void prefetch(const std::string& id_name, const std::string& out_name,
              const std::vector<std::string>& table_names,
              const std::vector<std::string>& epmap,
              const std::vector<int>& height_sections,
-              const framework::ExecutionContext& context) {
-  auto& local_scope = context.scope().NewScope();
+              const framework::ExecutionContext& context,
+              const framework::Scope& scope) {
+  auto& local_scope = scope.NewScope();

  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto& cpu_ctx = *pool.Get(platform::CPUPlace());
@@ -190,11 +197,11 @@ void prefetch(const std::string& id_name, const std::string& out_name,
    out_var_names.push_back(out_name + "@" + epmap[i]);
  }

-  auto& id_tensor = local_scope.FindVar(id_name)->Get<framework::LoDTensor>();
+  auto& id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
  std::vector<int64_t> ids_vector;
  if (platform::is_cpu_place(id_tensor.place())) {
    auto* id_data = id_tensor.data<int64_t>();
-    for (size_t i = 0; i < id_tensor.numel(); ++i) {
+    for (int64_t i = 0; i < id_tensor.numel(); ++i) {
      ids_vector.push_back(id_data[i]);
    }
  } else {
@@ -202,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
    PADDLE_THROW("paddle is not compiled with CUDA!");
 #else
    auto cpu_place = platform::CPUPlace();
-    framework::Tensor cpu_tensor;
+    framework::LoDTensor cpu_tensor;
    auto* cpu_tensor_data =
        cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place);
    auto stream =
@@ -246,8 +253,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
  MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,
                                    out_var_names, height_sections, splited_ids,
                                    context, &local_scope, &actual_ctx);
-
-  context.scope().DeleteScope(&local_scope);
+  scope.DeleteScope(&local_scope);
 }

 };  // namespace distributed

--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
@@ -27,7 +27,56 @@ void prefetch(const std::string& id_name, const std::string& out_name,
              const std::vector<std::string>& table_names,
              const std::vector<std::string>& epmap,
              const std::vector<int>& height_sections,
-              const framework::ExecutionContext& context);
+              const framework::ExecutionContext& context,
+              const framework::Scope& scope);
+
+template <typename T>
+void prefetch_with_reconstruct(const std::string& id_name,
+                               const std::string& out_name,
+                               const std::vector<std::string>& table_names,
+                               const std::vector<std::string>& epmap,
+                               const std::vector<int>& height_sections,
+                               const framework::ExecutionContext& context,
+                               const framework::Scope& scope,
+                               framework::LoDTensor* original) {
+  prefetch(id_name, out_name, table_names, epmap, height_sections, context,
+           scope);
+  auto& out = scope.FindVar(out_name)->Get<framework::LoDTensor>();
+  auto& ids = scope.FindVar(id_name)->Get<framework::LoDTensor>();
+  auto* original_value = original->data<T>();
+  auto* out_value = out.data<T>();
+  size_t original_width = original->numel() / original->dims()[0];
+
+  bool is_on_cpu_place = true;
+  if (!platform::is_cpu_place(ids.place())) {
+    is_on_cpu_place = false;
+  }
+  if (is_on_cpu_place) {
+    for (int64_t i = 0; i < ids.numel(); i++) {
+      const T* out_rows = out_value + original_width * i;
+      T* original_row =
+          original_value + original_width * ids.data<int64_t>()[i];
+      std::memcpy(original_row, out_rows, original_width * sizeof(T));
+    }
+  } else {
+#ifndef PADDLE_WITH_CUDA
+    PADDLE_THROW("paddle is not compiled with CUDA!");
+#else
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& actual_ctx = *pool.Get(context.GetPlace());
+    for (int64_t i = 0; i < ids.numel(); i++) {
+      const T* out_rows = out_value + original_width * i;
+      T* original_row =
+          original_value + original_width * ids.data<int64_t>()[i];
+      auto stream =
+          static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(ids.place()), original_row,
+                   platform::CPUPlace(), out_rows, original_width * sizeof(T),
+                   stream);
+    }
+#endif
+  }
+}

 };  // namespace distributed
 };  // namespace operators

--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
+#include "paddle/fluid/platform/float16.h"

 namespace ops = paddle::operators;

 REGISTER_OP_CUDA_KERNEL(
    elementwise_sub,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
+                              paddle::platform::float16>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
    elementwise_sub_grad,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,

--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+
+template <typename T>
+struct EmbeddingVSumFunctor {
+  void operator()(const framework::ExecutionContext &context,
+                  const LoDTensor *table_t, const LoDTensor *ids_t,
+                  LoDTensor *output_t) {
+    auto *table = table_t->data<T>();
+    int64_t row_number = table_t->dims()[0];
+    int64_t row_width = table_t->dims()[1];
+    int64_t last_dim = output_t->dims()[1];
+    const int64_t *ids = ids_t->data<int64_t>();
+    auto ids_lod = ids_t->lod()[0];
+    int64_t ids_count = ids_t->numel() / ids_lod.back();
+
+    auto *output = output_t->mutable_data<T>(context.GetPlace());
+
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+    for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
+      size_t begin = ids_lod[i] * ids_count;
+      for (int64_t j = 0; j != ids_count; ++j) {
+        PADDLE_ENFORCE_LT(ids[begin], row_number);
+        PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i);
+        blas.VCOPY(row_width, table + ids[begin + j] * row_width,
+                   output + i * last_dim + j * row_width);
+      }
+
+      for (int64_t r = (ids_lod[i] + 1) * ids_count;
+           r < ids_lod[i + 1] * ids_count; ++r) {
+        PADDLE_ENFORCE_LT(ids[r], row_number);
+        PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i);
+        blas.AXPY(row_width, 1., table + ids[r] * row_width,
+                  output + i * last_dim + (r % ids_count) * row_width);
+      }
+    }
+  }
+};
+
+template <typename T>
+class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const LoDTensor *ids_t = context.Input<LoDTensor>("Ids");  // int tensor
+    LoDTensor *output_t = context.Output<LoDTensor>("Out");    // float tensor
+    const LoDTensor *table_var = context.Input<LoDTensor>("W");
+    const std::string &combiner_type = context.Attr<std::string>("combiner");
+
+    if (combiner_type == "sum") {
+      EmbeddingVSumFunctor<T> functor;
+      functor(context, table_var, ids_t, output_t);
+    }
+  }
+};
+
+template <typename T>
+class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *table_var = context.InputVar("W");
+    DDim table_dim;
+    if (table_var->IsType<LoDTensor>()) {
+      table_dim = context.Input<LoDTensor>("W")->dims();
+    } else if (table_var->IsType<SelectedRows>()) {
+      auto *table_t = context.Input<SelectedRows>("W");
+      table_dim = table_t->value().dims();
+    } else {
+      PADDLE_THROW(
+          "The parameter W of a LookupTable "
+          "must be either LoDTensor or SelectedRows");
+    }
+
+    bool is_sparse = context.Attr<bool>("is_sparse");
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
+    if (is_sparse) {
+      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+
+      auto *ids_data = ids->data<int64_t>();
+      int64_t ids_num = ids->numel();
+      auto lod = ids->lod()[0];
+      int64_t row_width = d_output->dims()[1];
+
+      framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
+      new_rows->resize(ids_num);
+      std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t));
+
+      auto *d_table_value = d_table->mutable_value();
+      d_table_value->Resize({ids_num, table_dim[1]});
+      T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
+      const T *d_output_data = d_output->data<T>();
+
+      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+      for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
+        int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
+        int64_t in_offset = lod[i] * row_width;
+        const T *out_pos = d_output_data + i * row_width;
+        T *in_pos = d_table_data + in_offset;
+        for (int r = 0; r != h; ++r) {
+          blas.VCOPY(row_width, out_pos, in_pos + r * row_width);
+        }
+      }
+    } else {
+      LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
--- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h
+++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -190,6 +190,26 @@ void BenchGRUKernel() {
  }
 }

+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void BenchSeqPoolKernel() {
+  std::vector<jit::SeqPoolType> pool_types = {
+      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
+  for (auto type : pool_types) {
+    for (int w : TestSizes()) {
+      jit::seq_pool_attr_t attr(w, type);
+      for (int h : TestSizes()) {
+        attr.h = h;
+        std::vector<T> x(h * w), y(w);
+        RandomVec<T>(h * w, x.data(), -2.f, 2.f);
+        const T* x_data = x.data();
+        T* y_data = y.data();
+        BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
+                                                            y_data, &attr);
+      }
+    }
+  }
+}
+
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
 // Options:
@@ -228,4 +248,7 @@ int main(int argc, char* argv[]) {
  BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
+
+  // seq pool function
+  BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>();
 }
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1)
 USE_JITKERNEL_GEN(kGRUHtPart1)
 USE_JITKERNEL_GEN(kGRUHtPart2)
 USE_JITKERNEL_GEN(kNCHW16CMulNC)
+USE_JITKERNEL_GEN(kSeqPool)
--- a/paddle/fluid/operators/jit/gen/seqpool.cc
+++ b/paddle/fluid/operators/jit/gen/seqpool.cc
--- a/paddle/fluid/operators/jit/gen/seqpool.h
+++ b/paddle/fluid/operators/jit/gen/seqpool.h
--- a/paddle/fluid/operators/jit/helper.cc
+++ b/paddle/fluid/operators/jit/helper.cc
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
--- a/paddle/fluid/operators/jit/kernel_key.cc
+++ b/paddle/fluid/operators/jit/kernel_key.cc
--- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
--- a/paddle/fluid/operators/jit/refer/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt
--- a/paddle/fluid/operators/jit/refer/refer.cc
+++ b/paddle/fluid/operators/jit/refer/refer.cc
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
--- a/paddle/fluid/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
--- a/paddle/fluid/operators/ngraph/ngraph_ops.h
+++ b/paddle/fluid/operators/ngraph/ngraph_ops.h
--- a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
+++ b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
--- a/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h
+++ b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h
--- a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
+++ b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
--- a/paddle/fluid/operators/ngraph/ops/mean_op.h
+++ b/paddle/fluid/operators/ngraph/ops/mean_op.h
--- a/paddle/fluid/operators/ngraph/ops/mul_op.h
+++ b/paddle/fluid/operators/ngraph/ops/mul_op.h
--- a/paddle/fluid/operators/ngraph/ops/scale_op.h
+++ b/paddle/fluid/operators/ngraph/ops/scale_op.h
--- a/paddle/fluid/operators/ngraph/ops/top_k_op.h
+++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
--- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
--- a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
--- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
--- a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py
--- a/python/paddle/fluid/tests/unittests/test_ir_graph.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_graph.py
--- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py