Commit 9adb158e, authored by peizhilin

Merge remote-tracking branch 'upstream/develop' into debug/support

@@ -19,3 +19,10 @@ find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALL
mark_as_advanced(
JEMALLOC_LIBRARIES
JEMALLOC_INCLUDE_DIR)
if (JEMALLOC_FOUND)
add_library(jemalloc::jemalloc UNKNOWN IMPORTED)
set_target_properties(jemalloc::jemalloc PROPERTIES
IMPORTED_LOCATION ${JEMALLOC_LIBRARIES}
INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}")
endif()
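Note: with the imported target defined above, a consumer no longer has to handle JEMALLOC_LIBRARIES and JEMALLOC_INCLUDE_DIR by hand. A minimal sketch, not part of the commit; the `demo` target and main.cc are made-up names:

    # hypothetical consumer of this find module
    find_package(jemalloc REQUIRED)
    add_executable(demo main.cc)
    # jemalloc::jemalloc carries IMPORTED_LOCATION and the include directory
    target_link_libraries(demo jemalloc::jemalloc)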
@@ -5,6 +5,8 @@ endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
######################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
@@ -59,7 +61,7 @@ endfunction()
# select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
# List of arch names
-set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
set(archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING)
list(APPEND archs_names "Auto")
@@ -93,6 +95,8 @@ function(select_nvcc_arch_flags out_variable)
set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "All") elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs}) set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
...@@ -153,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x ...@@ -153,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
# warning for now. # warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
add_definitions("-DPADDLE_CUDA_BINVER=\"90\"")
elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
endif()
include_directories(${CUDA_INCLUDE_DIRS})
......
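Note: a sketch of how the new arch names are consumed at configure time. The select_nvcc_arch_flags() call matches the function defined above; the flag-append line is an assumption based on the conventional use of such helpers:

    # e.g. cmake .. -DCUDA_ARCH_NAME=Turing   (or Volta, All, Auto, ...)
    select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
    list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})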
@@ -23,11 +23,8 @@ set(BOOST_PROJECT "extern_boost")
# checked that the devtools package of CentOS 6 installs boost 1.41.0.
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
-if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
-message(STATUS "use pre defined download url")
-set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
-set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
-endif()
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
......
@@ -63,6 +63,15 @@ ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)
# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
if (WIN32)
include(CheckIncludeFileCXX)
check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI)
if (HAVE_SHLWAPI)
set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
endif(HAVE_SHLWAPI)
endif (WIN32)
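Note: the OS_DEPENDENCY_MODULES global property set here is read back by the build helpers in generic.cmake (see the hunks further down), following this pattern:

    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
    target_link_libraries(${TARGET_NAME} ${os_dependency_modules})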
IF(WITH_C_API)
INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
IF(ANDROID)
......
@@ -55,7 +55,7 @@ ExternalProject_Add(
${MKLDNN_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
-GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git"
GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
......
@@ -16,6 +16,12 @@ IF(NOT ${WITH_MKLML})
return()
ENDIF(NOT ${WITH_MKLML})
IF(APPLE)
MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
@@ -23,32 +29,24 @@ SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
-if(WIN32)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
SET(TIME_VERSION "2019.0.1.20181227")
IF(WIN32)
SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
-else()
ELSE()
SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
-endif()
ENDIF()
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
-IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
-MESSAGE(STATUS "use pre defined download url")
-if(WIN32)
-SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
-SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
-elseif(APPLE)
-SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE)
-SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-else()
-SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE)
-SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-ENDIF()
-endif()
SET(MKLML_PROJECT "extern_mklml")
MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
......
@@ -37,14 +37,18 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph")
-SET(NGRAPH_GIT_TAG "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9")
SET(NGRAPH_GIT_TAG "20bd8bbc79ae3a81c57313846a2be7313e5d1dab")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
-SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
SET(NGRAPH_TBB_LIB_NAME libtbb_debug.so.2)
else()
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
endif()
SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
@@ -66,16 +70,7 @@ ExternalProject_Add(
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
)
-# Workaround for nGraph expecting mklml to be in mkldnn install directory.
-ExternalProject_Add_Step(
-${NGRAPH_PROJECT}
-PrepareMKL
-COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so
-COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so
-DEPENDEES download
-DEPENDERS configure
-)
add_dependencies(ngraph ${NGRAPH_PROJECT})
......
@@ -117,7 +117,7 @@ function(common_link TARGET_NAME)
endif()
if (WITH_JEMALLOC)
-target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES})
target_link_libraries(${TARGET_NAME} jemalloc::jemalloc)
endif()
endfunction()
@@ -359,6 +359,8 @@ function(cc_binary TARGET_NAME)
add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
common_link(${TARGET_NAME})
endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
endfunction(cc_binary)
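Note: a hypothetical call site, to illustrate the effect of the two lines added above; every cc_binary now automatically links the OS-specific modules (shlwapi.lib on Windows) collected in OS_DEPENDENCY_MODULES:

    # my_tool and its source file are made-up names for illustration
    cc_binary(my_tool SRCS my_tool.cc DEPS gflags glog)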
function(cc_test TARGET_NAME)
@@ -367,18 +369,15 @@ function(cc_test TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
if(WIN32)
-list(APPEND win32_deps shlwapi)
if("${cc_test_DEPS};" MATCHES "python;")
list(REMOVE_ITEM cc_test_DEPS python)
-list(APPEND win32_deps ${PYTHON_LIBRARIES})
target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
endif()
endif(WIN32)
-add_executable(${TARGET_NAME} ${cc_test_SRCS})
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog)
-if(WIN32)
-target_link_libraries(${TARGET_NAME} ${win32_deps})
-endif(WIN32)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
common_link(${TARGET_NAME})
add_test(NAME ${TARGET_NAME}
@@ -451,7 +450,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME})
@@ -538,7 +538,8 @@ function(hip_test TARGET_NAME)
endif()
add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
-target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags ${os_dependency_modules})
add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME})
......
@@ -88,6 +88,7 @@ paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'poo
paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, False, None, None, None, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@@ -210,6 +211,7 @@ paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], va
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@@ -405,28 +407,50 @@ paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.optimizer.SGDOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.SGDOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdamaxOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.DecayedAdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
paddle.fluid.optimizer.FtrlOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.FtrlOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.RMSPropOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
......
@@ -94,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass
-memory_optimize_pass)
memory_optimize_pass lock_free_optimize_pass)
@@ -232,3 +232,4 @@ USE_PASS(analysis_var_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(lock_free_optimize_pass);
@@ -226,7 +226,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
 * Only variables should be the leaves of graph.
 */
AddOutputToLeafOps(&result);
-result.Erase<GraphOps>(kGraphOps);
result.Erase(kGraphOps);
return graph;
}
......
@@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base)
pass_library(lock_free_optimize_pass base)
pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference)
@@ -41,11 +42,23 @@ pass_library(seq_concat_fc_fuse_pass inference)
pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(seqpool_concat_fuse_pass inference)
pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference)
# There may be many transpose-flatten structures in a model, and the outputs
# of these structures are used as inputs to the concat op. This pattern is
# detected by our pass. The index here represents the number of structures
# in the pattern. We use indices 3 to 6 because these pattern sizes are
# common in real models.
foreach (index RANGE 3 6)
file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()
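Note: for indices 3 through 6, the loop above appends USE_PASS(transpose_flatten3_concat_fuse_pass); through USE_PASS(transpose_flatten6_concat_fuse_pass); to the generated ${pass_file}, one registration per pattern size.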
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base)
@@ -67,6 +80,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
......
@@ -109,7 +109,6 @@ class Graph {
attr_dels_[attr_name] = []() {};
}
-template <typename AttrType>
void Erase(const std::string &attr_name) {
PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph",
attr_name);
......
@@ -1306,6 +1306,69 @@ PDNode *patterns::ConvAffineChannel::operator()(
return ac_out_var;
}
// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
// ...
// z -> transpose_op(n) -> transpose_out_z -> flatten_op(n) -> flatten_out_z
// flatten_out_a -> concat_op flatten_out_b -> concat_op ... flatten_out_z ->
// concat_op
PDNode *patterns::TransposeFlattenConcat::operator()(
std::vector<PDNode *> conv_in, int times) {
// The times represents the repeat times of the
// {trans, trans_out, flatten, flatten_out}
const int kNumFields = 4;
const int kTransOutOffset = 1;
const int kFlattenOffset = 2;
const int kFlattenOutOffset = 3;
std::vector<PDNode *> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(
pattern->NewNode(GetNodeName("transpose" + std::to_string(i)))
->assert_is_op("transpose2"));
nodes.push_back(
pattern->NewNode(GetNodeName("transpose_out" + std::to_string(i)))
->assert_is_op_output("transpose2")
->assert_is_op_input("flatten2", "X")
->AsIntermediate());
nodes.push_back(pattern->NewNode(GetNodeName("flatten" + std::to_string(i)))
->assert_is_op("flatten2"));
nodes.push_back(
pattern->NewNode(GetNodeName("flatten_out" + std::to_string(i)))
->assert_is_op_output("flatten2")
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
}
auto concat_op = pattern->NewNode(GetNodeName("concat"))
->assert_is_op("concat")
->assert_op_has_n_inputs("concat", times);
auto concat_out = pattern->NewNode(GetNodeName("concat_out"))
->assert_is_op_output("concat")
->AsOutput();
std::vector<PDNode *> flatten_outs;
for (int i = 0; i < times; i++) {
conv_in[i]->AsInput();
// trans
nodes[i * kNumFields]->LinksFrom({conv_in[i]});
// trans_out
nodes[i * kNumFields + kTransOutOffset]->LinksFrom({nodes[i * kNumFields]});
// flatten
nodes[i * kNumFields + kFlattenOffset]->LinksFrom(
{nodes[i * kNumFields + kTransOutOffset]});
// flatten_out
nodes[i * kNumFields + kFlattenOutOffset]->LinksFrom(
{nodes[i * kNumFields + kFlattenOffset]});
flatten_outs.push_back(nodes[i * kNumFields + kFlattenOutOffset]);
}
concat_op->LinksFrom(flatten_outs).LinksTo({concat_out});
return concat_out;
}
}  // namespace ir
}  // namespace framework
}  // namespace paddle
@@ -766,6 +766,21 @@ struct ConvAffineChannel : public PatternBase {
PATTERN_DECL_NODE(ac_out);  // Out
};
struct TransposeFlattenConcat : public PatternBase {
TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "transpose_flatten_concat") {}
PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
}
PDNode* GetPDNode(const std::string& op_type) {
return pattern->RetrieveNode(GetNodeName(op_type));
}
};
}  // namespace patterns
// Link two ir::Nodes from each other.
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/lock_free_optimize_pass.h"
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
const char kSumGradOpName[] = "sum";
// TODO(minqiyang): only support sgd at current time, please add
// other optimizers later.
const char kOptimizerType[] = "sgd";
std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
// Collect all weights' names from SGD ops, where
// W1 <- SGD(W0, Grad0)
std::unordered_set<std::string> weight_var_set;
for (auto* node : graph->Nodes()) {
if (IsOpNamed(node, kOptimizerType)) {
auto& param_out_vars = node->Op()->Output("ParamOut");
PADDLE_ENFORCE(param_out_vars.size() == 1u);
weight_var_set.insert(param_out_vars[0]);
}
}
// find all grad's merge op via weight name, where
// Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
std::unordered_set<ir::Node*> grad_sum_op_set;
for (ir::Node* node : graph->Nodes()) {
if (IsOpNamed(node, kSumGradOpName)) {
for (ir::Node* output : node->outputs) {
// strip the last grad suffix @GRAD
std::string var_name = output->Name();
const std::string suffix(kGradVarSuffix);
if (var_name != suffix && var_name.size() > suffix.size() &&
var_name.substr(var_name.size() - suffix.size()) == suffix) {
// if so then strip them off
var_name = var_name.substr(0, var_name.size() - suffix.size());
if (weight_var_set.find(var_name) != weight_var_set.end()) {
grad_sum_op_set.insert(node);
break;
}
}
}
}
}
// get the forward op and backward op pairs, where
// out <- forward(X, W)
// Grad1 <- backward(out, X')
// Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
// W0 <- SGD(W1, Grad0)
for (ir::Node* node : grad_sum_op_set) {
for (ir::Node* merged_grad_var : node->outputs) {
// find the optimizers connected with sum op
if (IsVarNameEndsWith(merged_grad_var, kGradVarSuffix) &&
merged_grad_var->outputs.size() == 1u) {
ir::Node* opt_node = merged_grad_var->outputs[0];
VLOG(3) << "Found opt node " << opt_node->Name();
// find the backward op connected with sum op
for (ir::Node* unmerged_grad_var : node->inputs) {
if (IsVarNameContains(unmerged_grad_var, kGradVarSuffix) &&
unmerged_grad_var->inputs.size() == 1u) {
ir::Node* backward_op = unmerged_grad_var->inputs[0];
VLOG(3) << "Found backward_op " << backward_op->Name();
// find the forward op related to the backward op
ir::Node* forward_op =
FindForwardOpViaBackwardOp(graph.get(), backward_op);
VLOG(3) << "Found forward_op " << forward_op->Name();
PADDLE_ENFORCE(forward_op);
Node* new_optimizer_node = CreateNewSGDNode(
graph.get(), forward_op, backward_op, node, opt_node);
PADDLE_ENFORCE(new_optimizer_node);
}
}
}
}
}
// Remove the sum_op, its outputs, and the connected optimizers
for (Node* sum_op : grad_sum_op_set) {
for (Node* sum_op_output : sum_op->outputs) {
for (Node* optimize_op : sum_op_output->outputs) {
if (optimize_op->NodeType() == Node::Type::kOperation &&
optimize_op->Name() == kOptimizerType) {
VLOG(3) << "remove optimize_op: " << optimize_op->Name() << "_"
<< optimize_op->id();
graph->RemoveNode(optimize_op);
}
}
VLOG(3) << "remove sum_op_output: " << sum_op_output->Name() << "_"
<< sum_op_output->id();
graph->RemoveNode(sum_op_output);
}
VLOG(3) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id();
graph->RemoveNode(sum_op);
}
for (auto* node : graph->Nodes()) {
for (Node* output_node : node->outputs) {
if (output_node->Name() == "sgd") {
VLOG(3) << "Node link to SGD: " << node->Name() << "_" << node->id()
<< " --> " << output_node->Name() << "_" << output_node->id();
for (Node* input_node : node->inputs) {
VLOG(3) << "SGD Input link: " << input_node->Name() << "_"
<< input_node->id() << " --> " << node->Name() << "_"
<< node->id();
}
}
}
}
return graph;
}
ir::Node* LockFreeOptimizePass::CreateNewSGDNode(
ir::Graph* graph, ir::Node* forward_node, ir::Node* backward_node,
ir::Node* grad_sum_node, ir::Node* optimize_node) const {
PADDLE_ENFORCE(graph);
PADDLE_ENFORCE(forward_node);
PADDLE_ENFORCE(backward_node);
PADDLE_ENFORCE(grad_sum_node);
PADDLE_ENFORCE(optimize_node);
// find the grad var node between the grad sum node and backward_node
std::vector<ir::Node*> grad_vars =
FindConnectedNode(backward_node, grad_sum_node);
ir::Node* grad_node = nullptr;
for (ir::Node* node : grad_vars) {
if (!ir::IsControlDepVar(*node)) {
grad_node = node;
}
}
PADDLE_ENFORCE(grad_node);
// create a new SGD node
OpDesc* old_desc = optimize_node->Op();
// keep the new optimizer in the same block as the old one
OpDesc new_desc(*old_desc, old_desc->Block());
new_desc.SetInput("Param", old_desc->Input("Param"));
new_desc.SetInput("LearningRate", old_desc->Input("LearningRate"));
new_desc.SetInput("Grad", std::vector<std::string>({grad_node->Name()}));
new_desc.SetOutput("ParamOut", old_desc->Output("ParamOut"));
std::vector<std::string> op_role_vars = boost::get<std::vector<std::string>>(
new_desc.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName()));
// replace the second op role var, because the grad name was
// changed in the new optimizer
op_role_vars.pop_back();
op_role_vars.push_back(grad_node->Name());
new_desc.SetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
op_role_vars);
new_desc.SetType(kOptimizerType);
// set backward op's op role var, this will be used to
// set device_id in multi_device_pass
backward_node->Op()->SetAttr(
framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), op_role_vars);
// backward_node->Op()->SetAttr(
// framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), {});
// keep the same output nodes between the new optimizer and the
// old one
Node* sgd_node = graph->CreateOpNode(&new_desc);
// change all outputs of the optimize_node to the new one
ReplaceAllDownstreamNode(optimize_node, sgd_node);
// find connected node between forward node and optimize node
// and replace the optimize node to new sgd node
std::vector<ir::Node*> forward_opt_connected_nodes =
FindConnectedNode(forward_node, optimize_node);
for (ir::Node* node : forward_opt_connected_nodes) {
ReplaceUpstreamNode(node, optimize_node, sgd_node);
}
// find connected node between backward node and optimize node
// and replace the optimize node to new sgd node
std::vector<ir::Node*> backward_opt_connected_nodes =
FindConnectedNode(backward_node, optimize_node);
for (ir::Node* node : backward_opt_connected_nodes) {
ReplaceUpstreamNode(node, optimize_node, sgd_node);
}
// SGD must have exactly one Param input and one LearningRate input
PADDLE_ENFORCE(old_desc->Input("LearningRate").size() == 1u);
PADDLE_ENFORCE(old_desc->Input("Param").size() == 1u);
// LR and weight nodes should be copied
for (Node* upstream_node : optimize_node->inputs) {
if (upstream_node->Name() == old_desc->Input("LearningRate")[0] ||
upstream_node->Name() == old_desc->Input("Param")[0]) {
ReplaceUpstreamNode(upstream_node, optimize_node, sgd_node);
}
}
VLOG(3) << "Create new opt node" << sgd_node->Name() << "_" << sgd_node->id();
return sgd_node;
}
std::vector<ir::Node*> LockFreeOptimizePass::FindConnectedNode(
ir::Node* upstream_node, ir::Node* downstream_node) const {
std::vector<ir::Node*> result;
for (ir::Node* out_node : upstream_node->outputs) {
for (ir::Node* in_node : downstream_node->inputs) {
if (in_node == out_node) {
result.push_back(in_node);
}
}
}
return result;
}
void LockFreeOptimizePass::ReplaceUpstreamNode(
ir::Node* upstream_node, ir::Node* old_optimizer_node,
ir::Node* new_optimizer_node) const {
PADDLE_ENFORCE(upstream_node);
PADDLE_ENFORCE(old_optimizer_node);
PADDLE_ENFORCE(new_optimizer_node);
// Remove the old_optimizer_node from upstream_node's outputs vector
auto& output_node_vec = upstream_node->outputs;
for (auto output_node_iter = output_node_vec.begin();
output_node_iter != output_node_vec.end();) {
if (*output_node_iter == old_optimizer_node) {
output_node_vec.erase(output_node_iter);
break;
} else {
++output_node_iter;
}
}
// Add the new_optimizer_node to upstream_node's outputs vector
output_node_vec.emplace_back(new_optimizer_node);
new_optimizer_node->inputs.emplace_back(upstream_node);
}
void LockFreeOptimizePass::ReplaceAllDownstreamNode(
ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const {
PADDLE_ENFORCE(old_optimizer_node);
PADDLE_ENFORCE(new_optimizer_node);
for (ir::Node* downstream_node : old_optimizer_node->outputs) {
// Remove the old_optimizer_node from downstream_node's inputs vector
auto& input_node_vec = downstream_node->inputs;
for (auto input_node_iter = input_node_vec.begin();
input_node_iter != input_node_vec.end();) {
if (*input_node_iter == old_optimizer_node) {
input_node_vec.erase(input_node_iter);
break;
} else {
++input_node_iter;
}
}
// Add the new_optimizer_node to downstream_node's inputs vector
input_node_vec.emplace_back(new_optimizer_node);
new_optimizer_node->outputs.emplace_back(downstream_node);
}
}
ir::Node* LockFreeOptimizePass::FindForwardOpViaBackwardOp(
ir::Graph* graph, ir::Node* backward_node) const {
PADDLE_ENFORCE(graph);
PADDLE_ENFORCE(backward_node);
// strip the suffix _grad of backward_node's name
std::string forward_op_name = backward_node->Name();
const std::string suffix("_grad");
if (forward_op_name != suffix && forward_op_name.size() > suffix.size() &&
forward_op_name.substr(forward_op_name.size() - suffix.size()) ==
suffix) {
// if so then strip them off
forward_op_name =
forward_op_name.substr(0, forward_op_name.size() - suffix.size());
} else {
LOG(WARNING) << "Illegal backward node's name " << backward_node->Name()
<< " id " << backward_node->id();
return nullptr;
}
for (ir::Node* node : graph->Nodes()) {
if (node->Name() == forward_op_name) {
if (node->outputs.size() == 0u) {
// if forward_node has no output, then it has NO grad op
continue;
}
// check that every input of the backward_op that ends with @GRAD
// corresponds to the gradient of one of this forward op's outputs
bool is_related_forward_node = true;
for (ir::Node* backward_input : backward_node->inputs) {
if (IsVarNameEndsWith(backward_input, kGradVarSuffix)) {
bool meets_correct_output = false;
for (ir::Node* forward_output : node->outputs) {
if (forward_output->Name() + kGradVarSuffix ==
backward_input->Name()) {
meets_correct_output = true;
break;
}
}
if (!meets_correct_output) {
is_related_forward_node = false;
break;
}
}
}
if (is_related_forward_node) {
return node;
}
}
}
return nullptr;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(lock_free_optimize_pass,
paddle::framework::ir::LockFreeOptimizePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
#define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
#include <string>
#include <vector>
#include <boost/algorithm/string/predicate.hpp>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
/*
 * Remove the sum op that accumulates all gradients of the backward ops,
 * and remove the optimizer's dependencies on that sum op, so each
 * gradient can be applied independently.
*
* Before this pass:
*
* forward_op1 forward_op2
* | |
* grad_op1 grad_op2
* \ /
* \ /
* sum_op
* |
* sgd_op
*
* After this pass:
* forward_op1 forward_op2
* | |
* grad_op1 grad_op2
* | |
* sgd_op1 sgd_op2
*
 * sgd_op1 and sgd_op2 update the same weight, which occupies the same
 * memory, so we can benefit from the lock-free acceleration
*/
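/*
 * A minimal usage sketch (illustrative only; it assumes the pass has been
 * registered under "lock_free_optimize_pass", which REGISTER_PASS in the
 * .cc file does, and that `graph` is a std::unique_ptr<ir::Graph>):
 *
 *   auto pass = PassRegistry::Instance().Get("lock_free_optimize_pass");
 *   graph = pass->Apply(std::move(graph));
 */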
class LockFreeOptimizePass : public Pass {
public:
virtual ~LockFreeOptimizePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
private:
// Create a new sgd node via current optimizer node
ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node,
ir::Node* backward_node, ir::Node* grad_sum_node,
ir::Node* optimize_node) const;
// Replace the input weight's optimizers
void ReplaceUpstreamNode(ir::Node* upstream_node,
ir::Node* old_optimizer_node,
ir::Node* new_optimizer_node) const;
// Replace the output weight's optimizers
void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node,
ir::Node* new_optimizer_node) const;
// Find all weight variables in graph
bool FindAllWeightVars(ir::Graph* graph) const;
// Find the forward_op node via the backward_op node
ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph,
ir::Node* backward_node) const;
std::vector<ir::Node*> FindConnectedNode(ir::Node* upstream_node,
ir::Node* downstream_node) const;
inline bool IsOpNamed(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kOperation && node->Name() == name;
}
inline bool IsVarNamed(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable && node->Name() == name;
}
inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable &&
boost::algorithm::ends_with(node->Name(), name);
}
inline bool IsVarNameContains(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable &&
node->Name().find(name) != std::string::npos;
}
inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const {
PADDLE_ENFORCE(ctrl_dep_node);
PADDLE_ENFORCE(node);
return IsControlDepVar(*ctrl_dep_node) &&
ctrl_dep_node->inputs.size() >= 1u &&
ctrl_dep_node->inputs[0] == node;
}
};
} // namespace ir
} // namespace framework
} // namespace paddle
#endif // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#define MAX_CONCAT_INPUTS 200
namespace paddle {
namespace framework {
namespace ir {
PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
const std::string& name_scope,
int num_inputs) {
auto is_concat_op_with_inputs = [](Node* x, int num) -> bool {
return x && x->IsOp() && x->Op()->Type() == "concat" &&
x->Op()->Input("X").size() == static_cast<size_t>(num);
};
auto is_nth_input_var_of_concat = [=](Node* x, int idx) -> bool {
return x && x->IsVar() && VarLinksToOp(x, "concat") &&
x->outputs.size() == 1 && IsNthInput(x, x->outputs[0], "X", idx) &&
is_concat_op_with_inputs(x->outputs[0], num_inputs);
};
auto is_seqpool_op_with_pooltype_of_nth_input_of_concat = [=](
Node* x, const std::string& type, int idx) -> bool {
bool this_is_seqpool_op =
x && x->IsOp() && x->Op()->Type() == "sequence_pool" &&
x->Op()->HasAttr("pooltype") &&
boost::get<std::string>(x->Op()->GetAttr("pooltype")) == type &&
x->outputs.size() == 2; // seqpool should only have 2 outputs
bool satisfied_all = this_is_seqpool_op;
if (this_is_seqpool_op) {
// Only one output of seqpool_op is nth_input_var of concat,
// the other one should be unused empty var.
if (is_nth_input_var_of_concat(x->outputs[0], idx)) {
satisfied_all = satisfied_all && x->outputs[1]->IsVar() &&
x->outputs[1]->outputs.empty();
} else {
satisfied_all =
satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) &&
x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0;
}
}
return satisfied_all;
};
auto* concat_op = pattern->NewNode(
[=](Node* x) { return is_concat_op_with_inputs(x, num_inputs); },
name_scope + "/concat_op");
concat_op->assert_op_attr<int>("axis", 1);
auto* concat_out_var = pattern->NewNode(
[=](Node* x) {
return x && x->IsVar() && VarLinksFromOp(x, "concat") &&
x->inputs.size() == 1 &&
is_concat_op_with_inputs(x->inputs[0], num_inputs);
},
name_scope + "/concat_out_var");
concat_out_var->assert_is_only_output_of_op("concat");
std::vector<PDNode*> seqpool_ops_input_var(num_inputs);
std::vector<PDNode*> seqpool_ops_output_var(num_inputs);
std::vector<PDNode*> seqpool_ops_output_unused_var(num_inputs);
std::vector<PDNode*> seqpool_ops(num_inputs);
for (int i = 0; i < num_inputs; ++i) {
seqpool_ops_output_var[i] = pattern->NewNode(
[=](Node* x) {
return x && x->IsVar() && is_nth_input_var_of_concat(x, i) &&
x->inputs.size() == 1 &&
is_seqpool_op_with_pooltype_of_nth_input_of_concat(x->inputs[0],
"SUM", i);
},
name_scope + "/sequence_pool_out_" + std::to_string(i));
seqpool_ops_output_unused_var[i] = pattern->NewNode(
[=](Node* x) {
return x && x->IsVar() && x->inputs.size() == 1 &&
x->outputs.size() == 0 &&
is_seqpool_op_with_pooltype_of_nth_input_of_concat(x->inputs[0],
"SUM", i);
},
name_scope + "/sequence_pool_unused_out_" + std::to_string(i));
seqpool_ops[i] = pattern->NewNode(
[=](Node* x) {
return x && x->IsOp() &&
is_seqpool_op_with_pooltype_of_nth_input_of_concat(x, "SUM", i);
},
name_scope + "/sequence_pool_op_" + std::to_string(i));
seqpool_ops_input_var[i] = pattern->NewNode(
[=](Node* x) {
bool basic = x && x->IsVar() && x->outputs.size() >= 1;
bool next_is_fine = false;
for (auto* o : x->outputs) {
if (is_seqpool_op_with_pooltype_of_nth_input_of_concat(o, "SUM",
i)) {
next_is_fine = true;
break;
}
}
return basic && next_is_fine;
},
name_scope + "/sequence_pool_in_" + std::to_string(i));
// Links
seqpool_ops[i]
->LinksFrom({seqpool_ops_input_var[i]})
.LinksTo({seqpool_ops_output_var[i], seqpool_ops_output_unused_var[i]});
}
concat_op->LinksFrom(seqpool_ops_output_var).LinksTo({concat_out_var});
return concat_out_var;
}
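// Detect every seqpool->concat subgraph with exactly num_inputs branches and
// replace each match with a single fusion_seqpool_concat op. Returns the
// number of subgraphs fused.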
int BuildFusion(Graph* graph, const std::string& name_scope, int num_inputs) {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
BuildSeqPoolConcatPattern(pattern, name_scope, num_inputs);
auto retrieve_node = [](const std::string& name,
const GraphPatternDetector::subgraph_t& subgraph,
const PDPattern& pat) -> Node* {
PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)),
"pattern has no Node called %s", name.c_str());
Node* p = subgraph.at(pat.RetrieveNode(name));
PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str());
return p;
};
int fusion_count{0};
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle SeqPool Concat fuse";
std::vector<std::string> input_names(num_inputs);
std::vector<Node*> input_vars(num_inputs);
auto& fused_pattern = gpd.pattern();
for (int i = 0; i < num_inputs; ++i) {
input_vars[i] =
retrieve_node(name_scope + "/sequence_pool_in_" + std::to_string(i),
subgraph, fused_pattern);
input_names[i] = input_vars[i]->Name();
}
auto* concat_op =
retrieve_node(name_scope + "/concat_op", subgraph, fused_pattern);
auto* concat_out_var =
retrieve_node(name_scope + "/concat_out_var", subgraph, fused_pattern);
auto* seqpool_op0 = retrieve_node(name_scope + "/sequence_pool_op_0",
subgraph, fused_pattern);
// Create New OpDesc
OpDesc op_desc;
op_desc.SetType("fusion_seqpool_concat");
op_desc.SetInput("X", input_names);
op_desc.SetAttr("pooltype", seqpool_op0->Op()->GetAttr("pooltype"));
op_desc.SetAttr("axis", concat_op->Op()->GetAttr("axis"));
op_desc.SetOutput("Out", {concat_out_var->Name()});
auto* op = graph->CreateOpNode(&op_desc);
for (size_t i = 0; i < input_vars.size(); ++i) {
IR_NODE_LINK_TO(input_vars[i], op);
}
IR_NODE_LINK_TO(op, concat_out_var);
std::unordered_set<const Node*> marked_nodes;
for (auto& item : subgraph) {
marked_nodes.insert(item.second);
}
for (size_t i = 0; i < input_vars.size(); ++i) {
marked_nodes.erase(input_vars[i]);
}
marked_nodes.erase(concat_out_var);
GraphSafeRemoveNodes(graph, marked_nodes);
++fusion_count;
};
gpd(graph, handler);
return fusion_count;
}
std::unique_ptr<ir::Graph> SeqPoolConcatFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = 0;
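// Each iteration only matches concat ops with exactly i inputs, so every
// input count from 1 to MAX_CONCAT_INPUTS is covered.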
for (int i = MAX_CONCAT_INPUTS; i > 0; --i) {
fusion_count +=
BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i);
}
AddStatis(fusion_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(seqpool_concat_fuse_pass,
paddle::framework::ir::SeqPoolConcatFusePass);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
/**
 * Fuse SequencePool (with "SUM" pooltype only, for now) and Concat;
*
* Before fuse:
* | | |
* seq_pool, seq_pool, ... seq_pool
* \ | ... /
* concat
* |
* After fuse:
* \ | /
* FusionSeqPoolConcat
* |
*/
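/*
 * The fused op created by this pass (see BuildFusion in the .cc file) is:
 *   type:    fusion_seqpool_concat
 *   input:   X = the original seqpool input variables
 *   attrs:   pooltype (taken from the first sequence_pool) and
 *            axis (taken from the concat)
 *   output:  Out = the original concat output variable
 */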
class SeqPoolConcatFusePass : public FusePassBase {
public:
virtual ~SeqPoolConcatFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"seqpool_concat_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
if (type == "sequence_pool") {
op->SetInput("X", {inputs[0]});
std::string pooltype = "SUM";
op->SetAttr("pooltype", pooltype);
op->SetOutput("MaxIndex", {outputs[0]});
op->SetOutput("Out", {outputs[1]});
} else if (type == "concat") {
op->SetInput("X", inputs);
op->SetAttr("axis", 1);
op->SetOutput("Out", {outputs[0]});
} else {
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
}
op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kForward));
}
int CountOpType(const ir::Graph* graph,
const std::string& op_type = "fusion_seqpool_concat") {
int count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()->Type() == op_type) {
++count;
}
}
return count;
}
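// Apply the given pass to the graph and report the node counts before/after.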
std::unique_ptr<ir::Graph> GetNumNodesOfBeforeAfter(
std::unique_ptr<ir::Graph> graph, int* before, int* after,
const std::string& pass_type = "seqpool_concat_fuse_pass") {
auto pass = PassRegistry::Instance().Get(pass_type);
*before = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
*after = graph->Nodes().size();
return graph;
}
/*
* Before fuse:
* a b c
* | | |
* op1 op2 op3
* / \ / \ / \
* d e f g h i
* \ | /
* concat
* |
* j
* Type of op1, op2 and op3 are sequence_pool, with "SUM" pooltype attr
*
* After fuse:
* a b c
* \ | /
* fusion_seqpool_concat
* |
* j
*/
TEST(SeqPoolConcatFusePass, basic) {
ProgramDesc prog;
for (auto& v : std::vector<std::string>(
{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::LOD_TENSOR);
}
SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
std::vector<std::string>({"d", "e"}));
SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
std::vector<std::string>({"f", "g"}));
SetOp(&prog, "sequence_pool", std::vector<std::string>({"c"}),
std::vector<std::string>({"h", "i"}));
SetOp(&prog, "concat", std::vector<std::string>({"e", "g", "i"}),
std::vector<std::string>({"j"}));
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int before, after;
graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
// Remove 10 Nodes: op1, op2, op3, d, e, f, g, h, i, concat_op
// Add 1 Node: fusion_seqpool_concat
EXPECT_EQ(after, before - 9);
EXPECT_EQ(CountOpType(graph.get()), 1);
}
/*
* Before fuse:
* a b
* | / \
* op1 op2 op3
* / \ / \ \
* c d e f g
* \ /
* concat
* |
* h
* Type of op1 and op2 are sequence_pool, with "SUM" pooltype attr
*
* After fuse:
* a b
* \ / \
* fusion_seqpool_concat op3
* | |
* h g
*/
TEST(SeqPoolConcatFusePass, advanced) {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::LOD_TENSOR);
}
SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
std::vector<std::string>({"c", "d"}));
SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
std::vector<std::string>({"e", "f"}));
SetOp(&prog, "op3", std::vector<std::string>({"b"}),
std::vector<std::string>({"g"}));
SetOp(&prog, "concat", std::vector<std::string>({"d", "f"}),
std::vector<std::string>({"h"}));
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int before, after;
graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
// Remove 7 Nodes: op1, op2, c, d, e, f, concat_op
// Add 1 Node: fusion_seqpool_concat
EXPECT_EQ(after, before - 6);
EXPECT_EQ(CountOpType(graph.get()), 1);
}
ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
ProgramDesc prog;
auto new_var = [&](const std::string& name) {
auto* var = prog.MutableBlock(0)->Var(name);
var->SetType(proto::VarType::LOD_TENSOR);
};
std::vector<std::string> concat_inputs;
for (int i = 0; i < num_inputs_of_concat; ++i) {
std::string prefix = "seqpool_op_" + std::to_string(i);
new_var(prefix + "in");
new_var(prefix + "out");
new_var(prefix + "out_unused");
SetOp(&prog, "sequence_pool", std::vector<std::string>({prefix + "in"}),
std::vector<std::string>({prefix + "out", prefix + "out_unused"}));
concat_inputs.push_back(prefix + "out");
}
SetOp(&prog, "concat", concat_inputs,
std::vector<std::string>({"concat_out"}));
return prog;
}
// test more inputs of concat
TEST(SeqPoolConcatFusePass, more_inputs) {
for (int num : {1, 2, 10}) {
ProgramDesc prog = BuildProgramDesc(num);
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int before, after;
graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
// Remove Nodes: n * (seqpool_op, out, out_unused), and concat_op
// Add Node: fusion_seqpool_concat op
EXPECT_EQ(after, before - num * 3);
EXPECT_EQ(CountOpType(graph.get()), 1);
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(seqpool_concat_fuse_pass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
template <int times>
std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name =
"transpose_flatten" + std::to_string(times) + "_concat_fuse";
FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd;
std::vector<PDNode *> input_nodes;
for (int i = 0; i < times; i++) {
input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(i))
->assert_is_op_input("transpose2", "X")
->AsInput());
}
patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(), pattern_name);
pattern(input_nodes, times);
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
const int kNumFields = 5;
const int kTransOffset = 1;
const int kTransOutOffset = 2;
const int kFlattenOffset = 3;
const int kFlattenOutOffset = 4;
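// For each of the `times` inputs, `nodes` holds five consecutive entries:
// input var, transpose op, transpose out, flatten op, flatten out.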
std::vector<Node *> nodes;
for (int i = 0; i < times; i++) {
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
PADDLE_ENFORCE(subgraph.at(input_nodes[i]));
nodes.push_back(subgraph.at(input_nodes[i]));
nodes.push_back(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
}
Node *concat_op = subgraph.at(pattern.GetPDNode("concat"));
Node *concat_out = subgraph.at(pattern.GetPDNode("concat_out"));
std::vector<std::string> input_names;
std::vector<int> trans_axis = boost::get<std::vector<int>>(
nodes[kTransOffset]->Op()->GetAttr("axis"));
int flatten_axis =
boost::get<int>(nodes[kFlattenOffset]->Op()->GetAttr("axis"));
int concat_axis = boost::get<int>(concat_op->Op()->GetAttr("axis"));
std::string output_name = concat_out->Name();
for (int i = 0; i < times; i++) {
input_names.push_back(nodes[i * kNumFields]->Name());
}
framework::OpDesc new_op_desc;
new_op_desc.SetType("fusion_transpose_flatten_concat");
new_op_desc.SetInput("X", input_names);
new_op_desc.SetAttr("trans_axis", trans_axis);
new_op_desc.SetAttr("flatten_axis", flatten_axis);
new_op_desc.SetAttr("concat_axis", concat_axis);
new_op_desc.SetOutput("Out", {output_name});
new_op_desc.Flush();
// Create a new node for the fused op.
auto *new_conv_op = graph->CreateOpNode(&new_op_desc);
std::unordered_set<const Node *> delete_nodes;
for (int i = 0; i < times; i++) {
nodes[i * kNumFields]->outputs.push_back(new_conv_op);
new_conv_op->inputs.push_back(nodes[i * kNumFields]);
delete_nodes.insert(nodes[i * kNumFields + kTransOffset]);
delete_nodes.insert(nodes[i * kNumFields + kTransOutOffset]);
delete_nodes.insert(nodes[i * kNumFields + kFlattenOffset]);
delete_nodes.insert(nodes[i * kNumFields + kFlattenOutOffset]);
}
delete_nodes.insert(concat_op);
new_conv_op->outputs.push_back(concat_out);
concat_out->inputs.push_back(new_conv_op);
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(), delete_nodes);
};
gpd(graph.get(), handler);
return graph;
}
template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>;
template class TransposeFlattenConcatFusePass<6>;
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(transpose_flatten_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
REGISTER_PASS(transpose_flatten4_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<4>);
REGISTER_PASS(transpose_flatten5_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<5>);
REGISTER_PASS(transpose_flatten6_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<6>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
// There may be many transpose-flatten structures in a model, and the outputs
// of these structures are used as the inputs of the concat op. This pattern
// will be detected by our pass. The `times` template parameter represents how
// many repetitions of this structure feed the concat.
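// For example, with times == 2 the detected subgraph is:
//
//   x0 -> transpose2 -> flatten \
//                                concat -> out
//   x1 -> transpose2 -> flatten /
//
// and it is rewritten into a single fusion_transpose_flatten_concat op that
// takes {x0, x1} as inputs.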
template <int times>
class TransposeFlattenConcatFusePass : public FusePassBase {
public:
virtual ~TransposeFlattenConcatFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -32,8 +32,11 @@ std::map<std::string, ...@@ -32,8 +32,11 @@ std::map<std::string,
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = { NgraphBridge::NG_NODE_MAP = {
{"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
{"mean", paddle::operators::ngraphs::BuildMeanNode},
{"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode},
{"mul", paddle::operators::ngraphs::BuildMulNode}, {"mul", paddle::operators::ngraphs::BuildMulNode},
{"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
{"scale", paddle::operators::ngraphs::BuildScaleNode},
{"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>}, {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
{"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}, {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
{"top_k", paddle::operators::ngraphs::BuildTopKNode}}; {"top_k", paddle::operators::ngraphs::BuildTopKNode}};
......
...@@ -395,7 +395,7 @@ class ExecutionContext { ...@@ -395,7 +395,7 @@ class ExecutionContext {
PADDLE_ENFORCE( PADDLE_ENFORCE(
dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr, dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation."); "The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_EQ(allocation_ptr->size(), PADDLE_ENFORCE_GE(allocation_ptr->size(),
framework::product(dim) * sizeof(T)); framework::product(dim) * sizeof(T));
paddle::framework::Tensor temp_tensor( paddle::framework::Tensor temp_tensor(
......
...@@ -193,15 +193,14 @@ ParallelExecutor::ParallelExecutor( ...@@ -193,15 +193,14 @@ ParallelExecutor::ParallelExecutor(
const std::unordered_set<std::string> &bcast_vars, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes, Scope *scope, const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
size_t num_trainers, size_t trainer_id)
: member_(new ParallelExecutorPrivate(places)) { : member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope; member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_; member_->use_cuda_ = exec_strategy.use_cuda_;
member_->build_strategy_ = build_strategy; member_->build_strategy_ = build_strategy;
member_->use_all_reduce_ = member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
member_->nranks_ = num_trainers * places.size(); member_->nranks_ = build_strategy.num_trainers_ * places.size();
if (!member_->use_all_reduce_) { if (!member_->use_all_reduce_) {
PADDLE_ENFORCE(places.size() > 1, PADDLE_ENFORCE(places.size() > 1,
...@@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor( ...@@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor(
} }
member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, num_trainers, trainer_id)); member_->places_, nccl_id, build_strategy.num_trainers_,
build_strategy.trainer_id_));
#else #else
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA");
#endif #endif
......
...@@ -50,8 +50,7 @@ class ParallelExecutor { ...@@ -50,8 +50,7 @@ class ParallelExecutor {
const std::string &loss_var_name, Scope *scope, const std::string &loss_var_name, Scope *scope,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy, const BuildStrategy &build_strategy);
size_t num_trainers = 1, size_t trainer_id = 0);
~ParallelExecutor(); ~ParallelExecutor();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// workaround for Python 2 issue: https://bugs.python.org/issue17120
#pragma push_macro("_XOPEN_SOURCE")
#pragma push_macro("_POSIX_C_SOURCE")
#undef _XOPEN_SOURCE
#undef _POSIX_C_SOURCE
#include "pybind11/pybind11.h"
#pragma pop_macro("_XOPEN_SOURCE")
#pragma pop_macro("_POSIX_C_SOURCE")
...@@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) { ...@@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) {
} }
Variable* Scope::Var(std::string* name) { Variable* Scope::Var(std::string* name) {
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); SCOPE_VARS_WRITER_LOCK
auto new_name = std::to_string(reinterpret_cast<uintptr_t>(this)) + "." +
std::to_string(vars_.size());
if (name != nullptr) { if (name != nullptr) {
*name = new_name; *name = new_name;
} }
SCOPE_VARS_WRITER_LOCK
return VarInternal(new_name); return VarInternal(new_name);
} }
......
...@@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder { ...@@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder {
} // namespace detail } // namespace detail
const std::type_index &ToTypeIndex(int var_id) { const std::type_index &VarTraitIdToTypeIndex(int var_id) {
return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
} }
const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } const char *ToTypeName(int var_id) {
return VarTraitIdToTypeIndex(var_id).name();
}
int ToTypeId(const std::type_index &type) { int TypeIndexToVarTraitId(const std::type_index &type) {
return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
} }
......
...@@ -66,8 +66,8 @@ namespace paddle { ...@@ -66,8 +66,8 @@ namespace paddle {
namespace framework { namespace framework {
const char *ToTypeName(int var_id); const char *ToTypeName(int var_id);
const std::type_index &ToTypeIndex(int var_id); const std::type_index &VarTraitIdToTypeIndex(int var_id);
int ToTypeId(const std::type_index &type); int TypeIndexToVarTraitId(const std::type_index &type);
namespace detail { namespace detail {
......
...@@ -45,10 +45,11 @@ struct TypeIndexChecker { ...@@ -45,10 +45,11 @@ struct TypeIndexChecker {
constexpr auto kId = VarTypeTrait<Type>::kId; constexpr auto kId = VarTypeTrait<Type>::kId;
std::type_index actual_type(typeid(Type)); std::type_index actual_type(typeid(Type));
EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name()));
EXPECT_EQ(ToTypeIndex(kId), actual_type); EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type);
EXPECT_EQ(ToTypeId(actual_type), kId); EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId);
EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type); EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)),
EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); actual_type);
EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId);
EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT
EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT
......
...@@ -27,6 +27,8 @@ ...@@ -27,6 +27,8 @@
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
std::map<int, py::object> py_funcs_;
using framework::Variable; using framework::Variable;
void AddTo(Variable* src, Variable* dst) { void AddTo(Variable* src, Variable* dst) {
...@@ -42,7 +44,7 @@ void AddTo(Variable* src, Variable* dst) { ...@@ -42,7 +44,7 @@ void AddTo(Variable* src, Variable* dst) {
src_tensor->numel()); src_tensor->numel());
float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace()); float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
const float* src_data = src_tensor->data<float>(); const float* src_data = src_tensor->data<float>();
for (size_t i = 0; i < src_tensor->numel(); ++i) { for (int64_t i = 0; i < src_tensor->numel(); ++i) {
dst_data[i] += src_data[i]; dst_data[i] += src_data[i];
} }
} }
...@@ -55,6 +57,7 @@ class Autograd { ...@@ -55,6 +57,7 @@ class Autograd {
if (var->stop_gradient_) { if (var->stop_gradient_) {
return; return;
} }
VLOG(3) << "start autograd";
std::deque<OpBase*> ready; std::deque<OpBase*> ready;
ready.push_back(var->pre_op_); ready.push_back(var->pre_op_);
...@@ -114,57 +117,63 @@ class Autograd { ...@@ -114,57 +117,63 @@ class Autograd {
} }
}; };
framework::LoDTensor& VarBase::Grad() { framework::LoDTensor& VarBase::GradValue() {
VLOG(3) << "get var grad " << var_desc_->Name(); VLOG(3) << "get var grad " << var_desc_->Name();
return *grads_->GetMutable<framework::LoDTensor>(); return *(grads_->var_->GetMutable<framework::LoDTensor>());
} }
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
if (!grad_op_desc_) { if (!grad_op_desc_ && backward_id_ <= 0) {
LOG(WARNING) << "op with no grad: " << op_desc_->Type(); LOG(WARNING) << "op with no grad: " << op_desc_->Type();
return {}; return {};
} }
VLOG(3) << "op grad " << grad_op_desc_->Type();
std::vector<std::unique_ptr<framework::Variable>> tmp_vars;
std::map<std::string, std::vector<framework::Variable*>> grad_outputs; std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
for (auto it : grad_output_vars_) { if (backward_id_ > 0) {
auto& outputs = grad_outputs[it.first]; VLOG(3) << "py_layer_grad";
for (size_t i = 0; i < it.second.size(); ++i) { grad_outputs["Out@GRAD"] =
// Allocate a new variable PyLayer::ApplyGrad(backward_id_, grad_input_vars_["X@GRAD"]);
Variable* tmp_var = new framework::Variable(); } else {
tmp_var->GetMutable<framework::LoDTensor>(); VLOG(3) << "op grad " << grad_op_desc_->Type();
for (auto it : grad_output_vars_) {
tmp_vars.emplace_back(tmp_var); auto& outputs = grad_outputs[it.first];
outputs.push_back(tmp_var); for (size_t i = 0; i < it.second.size(); ++i) {
// Allocate a new variable
Variable* tmp_var = new framework::Variable();
tmp_var->GetMutable<framework::LoDTensor>();
outputs.push_back(tmp_var);
}
} }
}
framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
// No need to do compile time infer shape here. // No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_); // grad_op_desc_->InferShape(*block_);
grad_op_desc_->InferVarType(block_); grad_op_desc_->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase = std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc_); framework::OpRegistry::CreateOp(*grad_op_desc_);
framework::OperatorWithKernel* op_kernel = framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(opbase.get()); dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
framework::Scope scope; framework::Scope scope;
platform::CPUPlace place; platform::CPUPlace place;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
p.op.RuntimeInferShape(scope, place, ctx); p.op.RuntimeInferShape(scope, place, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
}
for (auto it : grad_output_vars_) { for (auto it : grad_output_vars_) {
auto& outputs = grad_outputs[it.first]; auto& outputs = grad_outputs[it.first];
auto& origin_outputs = it.second; auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) { for (size_t i = 0; i < outputs.size(); ++i) {
framework::Variable* grad = outputs[i];
framework::Variable* orig_grad = origin_outputs[i]; framework::Variable* orig_grad = origin_outputs[i];
AddTo(outputs[i], orig_grad); AddTo(grad, orig_grad);
delete grad;
} }
} }
return input_vars_; return input_vars_;
...@@ -173,7 +182,8 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { ...@@ -173,7 +182,8 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
void VarBase::RunBackward() { void VarBase::RunBackward() {
if (!pre_op_) return; if (!pre_op_) return;
auto grads_t = grads_->GetMutable<framework::LoDTensor>(); VLOG(3) << "start backward";
auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
float* data = grads_t->mutable_data<float>(platform::CPUPlace()); float* data = grads_t->mutable_data<float>(platform::CPUPlace());
std::fill(data, data + grads_t->numel(), 1.0); std::fill(data, data + grads_t->numel(), 1.0);
...@@ -183,5 +193,65 @@ void VarBase::RunBackward() { ...@@ -183,5 +193,65 @@ void VarBase::RunBackward() {
Autograd().RunBackward(this); Autograd().RunBackward(this);
} }
void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
py_funcs_[func_id] = py_func;
}
int PyLayer::NumFuncs() { return py_funcs_.size(); }
std::vector<VarBase*> PyLayer::Apply(int func_id,
const std::vector<VarBase*>& inputs) {
std::vector<framework::Variable*> invars;
for (const VarBase* in : inputs) {
invars.push_back(in->var_);
}
PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
std::vector<Variable*> outvars = CallPythonFunc(py_funcs_[func_id], invars);
std::vector<VarBase*> ret;
for (Variable* v : outvars) {
ret.push_back(new VarBase(v, new VarBase(true)));
}
return ret;
}
std::vector<Variable*> PyLayer::ApplyGrad(
int func_id, const std::vector<framework::Variable*>& inputs) {
PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
return CallPythonFunc(py_funcs_[func_id], inputs);
}
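// Call the Python callable with the input tensors and wrap each returned
// LoDTensor in a newly allocated Variable.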
std::vector<framework::Variable*> PyLayer::CallPythonFunc(
const py::object& callable, const std::vector<framework::Variable*>& ins) {
py::gil_scoped_acquire guard;
py::tuple in_args(ins.size());
for (size_t i = 0; i < ins.size(); ++i) {
const framework::LoDTensor& t = ins[i]->Get<framework::LoDTensor>();
in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr);
}
VLOG(3) << "pyfunc in " << py::len(in_args);
// TODO(panyx0718): Who owns the returned LoDTensor?
auto ret = callable(in_args);
auto ret_tuple = py::cast<py::tuple>(ret);
size_t ret_num = py::len(ret_tuple);
std::vector<framework::Variable*> outs;
VLOG(3) << "pyfunc out " << ret_num;
for (size_t i = 0; i < ret_num; ++i) {
try {
auto* py_out_tensor = py::cast<framework::LoDTensor*>(ret_tuple[i]);
PADDLE_ENFORCE_NOT_NULL(py_out_tensor,
"Output tensor %d should not be nullptr", i);
auto* var = new framework::Variable();
auto* tensor = var->GetMutable<framework::LoDTensor>();
tensor->ShareDataWith(*py_out_tensor);
tensor->set_lod(py_out_tensor->lod());
outs.push_back(var);
} catch (py::cast_error&) {
PADDLE_THROW("The %d-th output must be LoDTensor", i);
}
}
return outs;
}
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
...@@ -14,17 +14,26 @@ ...@@ -14,17 +14,26 @@
#pragma once #pragma once
#include <map> // clang-format off
#include <string> #include "paddle/fluid/framework/python_headers.h"
#include <vector> // clang-format on
#include <map> // NOLINT
#include <string> // NOLINT
#include <vector> // NOLINT
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
namespace py = ::pybind11;
class PreparedOp { class PreparedOp {
public: public:
PreparedOp(const framework::OperatorBase& op, PreparedOp(const framework::OperatorBase& op,
...@@ -77,31 +86,50 @@ class PreparedOp { ...@@ -77,31 +86,50 @@ class PreparedOp {
framework::OperatorWithKernel::OpKernelFunc func; framework::OperatorWithKernel::OpKernelFunc func;
platform::DeviceContext* dev_ctx; platform::DeviceContext* dev_ctx;
}; };
class OpBase; class OpBase;
/* The wrapper for Variable which holds a Variable and a VarBase of its
 * gradient. This object should be managed entirely by the Python interpreter.
 *
 * Nearly all interfaces should be implemented in C++.
*/
class VarBase { class VarBase {
public: public:
VarBase() VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {}
// Owns `var` and `grad`
VarBase(framework::Variable* var, VarBase* grad)
: pre_op_(nullptr), : pre_op_(nullptr),
pre_op_out_name_(),
pre_op_out_idx_(-1), pre_op_out_idx_(-1),
var_desc_(nullptr), var_desc_(nullptr),
var_(new framework::Variable()), var_(var),
grads_(new framework::Variable()), grads_(grad),
stop_gradient_(false) {} stop_gradient_(false) {}
explicit VarBase(bool stop_gradient) explicit VarBase(bool stop_gradient)
: pre_op_(nullptr), : pre_op_(nullptr),
pre_op_out_name_(),
pre_op_out_idx_(-1), pre_op_out_idx_(-1),
var_desc_(nullptr), var_desc_(nullptr),
var_(new framework::Variable()), var_(new framework::Variable()),
grads_(new framework::Variable()), grads_(stop_gradient ? nullptr : new VarBase(true)),
stop_gradient_(stop_gradient) {} stop_gradient_(stop_gradient) {}
virtual ~VarBase() {} virtual ~VarBase() {
if (var_) {
delete var_;
}
if (grads_) {
delete grads_;
}
}
void RunBackward(); void RunBackward();
framework::LoDTensor& Grad(); framework::LoDTensor& GradValue();
inline std::string GradName() const { inline std::string GradName() const {
PADDLE_ENFORCE( PADDLE_ENFORCE(
...@@ -115,15 +143,23 @@ class VarBase { ...@@ -115,15 +143,23 @@ class VarBase {
int pre_op_out_idx_; int pre_op_out_idx_;
framework::VarDesc* var_desc_; framework::VarDesc* var_desc_;
framework::Variable* var_; framework::Variable* var_;
framework::Variable* grads_; VarBase* grads_;
bool stop_gradient_; bool stop_gradient_;
}; };
/* The wrapper for OpDesc which holds an OpDesc and the OpDesc of its
 * gradient. This object should be managed entirely by the Python interpreter.
*/
class OpBase { class OpBase {
public: public:
OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {} OpBase()
: op_desc_(nullptr),
forward_id_(-1),
grad_op_desc_(nullptr),
backward_id_(-1) {}
virtual ~OpBase() { virtual ~OpBase() {
if (grad_op_desc_) delete grad_op_desc_; if (grad_op_desc_) delete grad_op_desc_;
...@@ -131,16 +167,22 @@ class OpBase { ...@@ -131,16 +167,22 @@ class OpBase {
std::map<std::string, std::vector<VarBase*>> ApplyGrad(); std::map<std::string, std::vector<VarBase*>> ApplyGrad();
// One of `op_desc_` or `forward_id_` is set, not both.
// For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
framework::OpDesc* op_desc_; framework::OpDesc* op_desc_;
int forward_id_;
// When has backward, one of `grad_op_desc_` or `backward_id_` is set,
// not both.
framework::OpDesc* grad_op_desc_; framework::OpDesc* grad_op_desc_;
int backward_id_;
std::map<std::string, std::vector<VarBase*>> input_vars_; VarBasePtrMap input_vars_;
std::map<std::string, std::vector<VarBase*>> output_vars_; VarBasePtrMap output_vars_;
std::map<std::string, std::vector<OpBase*>> pre_ops_; OpBasePtrMap pre_ops_;
std::map<std::string, std::vector<int>> pre_ops_out_idx_; std::map<std::string, std::vector<int>> pre_ops_out_idx_;
std::map<std::string, std::vector<framework::Variable*>> grad_input_vars_; framework::VariableValueMap grad_input_vars_;
std::map<std::string, std::vector<framework::Variable*>> grad_output_vars_; framework::VariableValueMap grad_output_vars_;
framework::BlockDesc* block_; framework::BlockDesc* block_;
}; };
...@@ -152,8 +194,25 @@ class Layer { ...@@ -152,8 +194,25 @@ class Layer {
std::vector<VarBase> vars; std::vector<VarBase> vars;
return vars; return vars;
} }
};
class PyLayer {
public:
virtual ~PyLayer() {}
static void RegisterFunc(int func_id, const py::object& py_func);
static int NumFuncs();
static std::vector<VarBase*> Apply(int func_id,
const std::vector<VarBase*>& inputs);
static std::vector<framework::Variable*> ApplyGrad(
int func_id, const std::vector<framework::Variable*>& inputs);
virtual void Backward() { LOG(ERROR) << "To support customize"; } private:
static std::vector<framework::Variable*> CallPythonFunc(
const py::object& callable, const std::vector<framework::Variable*>& ins);
}; };
} // namespace imperative } // namespace imperative
......
...@@ -15,5 +15,199 @@ ...@@ -15,5 +15,199 @@
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
namespace paddle { namespace paddle {
namespace imperative {} // namespace imperative namespace imperative {
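// Build the gradient OpDesc of op_desc via its registered GradOpMaker
// (only a single grad op is supported for now).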
void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block,
framework::OpDesc** grad_op_desc,
std::unordered_map<std::string, std::string>* grad_to_var) {
std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance()
.Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
// TODO(panyx0718): Leak?
*grad_op_desc = grad_op_descs[0].release();
}
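// Allocate grad_var's tensor with var's shape on CPU and zero-fill it.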
void InitVar(framework::Variable* var, framework::Variable* grad_var) {
auto& var_t = var->Get<framework::LoDTensor>();
float* data =
grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
var_t.dims(), platform::CPUPlace());
std::fill(data, data + var_t.numel(), 0.0);
}
void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
const VarBasePtrMap& outputs, framework::BlockDesc* block,
const bool stop_gradient) {
std::map<std::string, VarBase*> vars;
framework::OpDesc* op_desc = op->op_desc_;
VLOG(3) << "tracer tracing " << op_desc->Type();
op_desc->InferShape(*block);
op_desc->InferVarType(block);
std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(*op_desc);
framework::VariableValueMap invars_map;
framework::VariableValueMap outvars_map;
op->input_vars_ = inputs;
for (auto it : op->input_vars_) {
auto& invars = invars_map[it.first];
for (VarBase* inp : it.second) {
PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr",
op->op_desc_->Type(), inp->var_desc_->Name());
invars.push_back(inp->var_);
vars[inp->var_desc_->Name()] = inp;
if (inp->pre_op_) {
op->pre_ops_[it.first].push_back(inp->pre_op_);
op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_);
} else {
op->pre_ops_[it.first].push_back(nullptr);
}
VLOG(3) << "input vname " << inp->var_desc_->Name() << " "
<< inp->var_->IsInitialized();
}
}
op->output_vars_ = outputs;
for (auto it : op->output_vars_) {
auto& outvars = outvars_map[it.first];
const std::vector<VarBase*>& outputs = it.second;
for (size_t i = 0; i < outputs.size(); ++i) {
VarBase* out = outputs[i];
outvars.push_back(out->var_);
vars[out->var_desc_->Name()] = out;
framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name());
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
out->var_->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
out->stop_gradient_ = stop_gradient;
out->pre_op_ = op;
out->pre_op_out_name_ = it.first;
out->pre_op_out_idx_ = i;
VLOG(3) << "output vname " << out->var_desc_->Name() << " "
<< out->var_->IsInitialized();
}
}
VLOG(3) << "tracer running " << op_desc->Type();
framework::RuntimeContext ctx(invars_map, outvars_map);
// TODO(panyx0718): Cache p.
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(op_base.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
framework::Scope scope;
platform::CPUPlace place;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
p.op.RuntimeInferShape(scope, place, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
if (!stop_gradient) {
framework::OpDesc* grad_op_desc;
// TODO(panyx): Is this leaked?
std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
new std::unordered_map<std::string, std::string>());
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get());
op->grad_op_desc_ = grad_op_desc;
for (auto it : grad_op_desc->Inputs()) {
auto& grad_in_vars = op->grad_input_vars_[it.first];
for (const std::string& grad_invar : it.second) {
block->FindRecursiveOrCreateVar(grad_invar);
auto var_it = grad_to_var->find(grad_invar);
if (var_it == grad_to_var->end()) {
auto fwd_var_it = vars.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != vars.end());
// Forward inputs or outputs.
grad_in_vars.push_back(fwd_var_it->second->var_);
} else {
VarBase* var = vars[var_it->second];
if (!var->grads_->var_->IsInitialized()) {
InitVar(var->var_, var->grads_->var_);
}
// Douts.
grad_in_vars.push_back(var->grads_->var_);
}
}
}
for (auto it : grad_op_desc->Outputs()) {
auto& grad_out_vars = op->grad_output_vars_[it.first];
for (const std::string& grad_outvar : it.second) {
block->FindRecursiveOrCreateVar(grad_outvar);
auto var_it = grad_to_var->find(grad_outvar);
PADDLE_ENFORCE(var_it != grad_to_var->end());
VarBase* var = vars[var_it->second];
if (!var->grads_->var_->IsInitialized()) {
InitVar(var->var_, var->grads_->var_);
}
grad_out_vars.push_back(var->grads_->var_);
}
}
}
op->block_ = block;
}
std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
const std::vector<VarBase*>& inputs,
bool stop_gradient) {
VLOG(3) << "py_trace";
op->input_vars_["X"] = inputs;
op->output_vars_["Out"] = PyLayer::Apply(op->forward_id_, inputs);
for (VarBase* inp : inputs) {
if (inp->pre_op_) {
op->pre_ops_["X"].push_back(inp->pre_op_);
op->pre_ops_out_idx_["X"].push_back(inp->pre_op_out_idx_);
} else {
op->pre_ops_["X"].push_back(nullptr);
}
}
auto& outputs = op->output_vars_["Out"];
for (size_t i = 0; i < outputs.size(); ++i) {
VarBase* out = outputs[i];
out->stop_gradient_ = stop_gradient;
out->pre_op_ = op;
out->pre_op_out_name_ = "Out";
out->pre_op_out_idx_ = i;
}
if (!stop_gradient) {
auto& grad_input_vars = op->grad_input_vars_["X@GRAD"];
auto& grad_output_vars = op->grad_output_vars_["Out@GRAD"];
for (const VarBase* inp : inputs) {
grad_input_vars.push_back(inp->var_);
}
for (VarBase* out : outputs) {
grad_input_vars.push_back(out->var_);
}
for (VarBase* out : outputs) {
grad_input_vars.push_back(out->grads_->var_);
if (!grad_input_vars.back()->IsInitialized()) {
InitVar(out->var_, grad_input_vars.back());
}
}
for (const VarBase* inp : inputs) {
grad_output_vars.push_back(inp->grads_->var_);
if (!grad_output_vars.back()->IsInitialized()) {
InitVar(inp->var_, grad_output_vars.back());
}
}
}
return outputs;
}
} // namespace imperative
} // namespace paddle } // namespace paddle
...@@ -30,23 +30,9 @@ void CreateGradOp(const framework::OpDesc& op_desc, ...@@ -30,23 +30,9 @@ void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set, const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block, const std::vector<framework::BlockDesc*>& grad_sub_block,
framework::OpDesc** grad_op_desc, framework::OpDesc** grad_op_desc,
std::unordered_map<std::string, std::string>* grad_to_var) { std::unordered_map<std::string, std::string>* grad_to_var);
std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance()
.Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
// TODO(panyx0718): Leak?
*grad_op_desc = grad_op_descs[0].release();
}
void InitVar(framework::Variable* var, framework::Variable* grad_var) { void InitVar(framework::Variable* var, framework::Variable* grad_var);
auto& var_t = var->Get<framework::LoDTensor>();
float* data =
grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
var_t.dims(), platform::CPUPlace());
std::fill(data, data + var_t.numel(), 0.0);
}
class Tracer { class Tracer {
public: public:
...@@ -57,120 +43,10 @@ class Tracer { ...@@ -57,120 +43,10 @@ class Tracer {
void Trace(OpBase* op, void Trace(OpBase* op,
const std::map<std::string, std::vector<VarBase*>>& inputs, const std::map<std::string, std::vector<VarBase*>>& inputs,
const std::map<std::string, std::vector<VarBase*>>& outputs, const std::map<std::string, std::vector<VarBase*>>& outputs,
framework::BlockDesc* block, const bool stop_gradient = false) { framework::BlockDesc* block, const bool stop_gradient = false);
std::map<std::string, VarBase*> vars;
framework::OpDesc* op_desc = op->op_desc_; std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
VLOG(3) << "tracer tracing " << op_desc->Type(); bool stop_gradient = false);
op_desc->InferShape(*block);
op_desc->InferVarType(block);
std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(*op_desc);
framework::VariableValueMap invars_map;
framework::VariableValueMap outvars_map;
op->input_vars_ = inputs;
for (auto it : op->input_vars_) {
auto& invars = invars_map[it.first];
for (VarBase* inp : it.second) {
PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr",
op->op_desc_->Type(), inp->var_desc_->Name());
invars.push_back(inp->var_);
vars[inp->var_desc_->Name()] = inp;
if (inp->pre_op_) {
op->pre_ops_[it.first].push_back(inp->pre_op_);
op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_);
} else {
op->pre_ops_[it.first].push_back(nullptr);
}
VLOG(3) << "input vname " << inp->var_desc_->Name() << " "
<< inp->var_->IsInitialized();
}
}
op->output_vars_ = outputs;
for (auto it : op->output_vars_) {
auto& outvars = outvars_map[it.first];
const std::vector<VarBase*>& outputs = it.second;
for (size_t i = 0; i < outputs.size(); ++i) {
VarBase* out = outputs[i];
outvars.push_back(out->var_);
vars[out->var_desc_->Name()] = out;
framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name());
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
out->var_->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
out->stop_gradient_ = stop_gradient;
out->pre_op_ = op;
out->pre_op_out_name_ = it.first;
out->pre_op_out_idx_ = i;
VLOG(3) << "output vname " << out->var_desc_->Name() << " "
<< out->var_->IsInitialized();
}
}
VLOG(3) << "tracer running " << op_desc->Type();
framework::RuntimeContext ctx(invars_map, outvars_map);
// TODO(panyx0718): Cache p.
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(op_base.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
framework::Scope scope;
platform::CPUPlace place;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
p.op.RuntimeInferShape(scope, place, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
if (!stop_gradient) {
framework::OpDesc* grad_op_desc;
auto grad_to_var = new std::unordered_map<std::string, std::string>();
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
op->grad_op_desc_ = grad_op_desc;
for (auto it : grad_op_desc->Inputs()) {
auto& grad_in_vars = op->grad_input_vars_[it.first];
for (const std::string& grad_invar : it.second) {
block->FindRecursiveOrCreateVar(grad_invar);
auto var_it = grad_to_var->find(grad_invar);
if (var_it == grad_to_var->end()) {
auto fwd_var_it = vars.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != vars.end());
grad_in_vars.push_back(fwd_var_it->second->var_);
} else {
VarBase* var = vars[var_it->second];
if (!var->grads_->IsInitialized()) {
InitVar(var->var_, var->grads_);
}
grad_in_vars.push_back(var->grads_);
}
}
}
for (auto it : grad_op_desc->Outputs()) {
auto& grad_out_vars = op->grad_output_vars_[it.first];
for (const std::string& grad_outvar : it.second) {
block->FindRecursiveOrCreateVar(grad_outvar);
auto var_it = grad_to_var->find(grad_outvar);
PADDLE_ENFORCE(var_it != grad_to_var->end());
VarBase* var = vars[var_it->second];
if (!var->grads_->IsInitialized()) {
InitVar(var->var_, var->grads_);
}
grad_out_vars.push_back(var->grads_);
}
}
}
op->block_ = block;
}
 private:
  framework::BlockDesc* root_block_;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <string>
#include <vector>
namespace paddle {
namespace imperative {
class VarBase;
class OpBase;
typedef std::map<std::string, std::vector<VarBase*>> VarBasePtrMap;
typedef std::map<std::string, std::vector<OpBase*>> OpBasePtrMap;
} // namespace imperative
} // namespace paddle
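Editor's note: an illustrative, self-contained sketch (not part of the commit) of the map shape these typedefs describe. The VarBase stub below is a stand-in for the real class, used only so the snippet compiles on its own.

#include <map>
#include <string>
#include <vector>

// Stand-in for paddle::imperative::VarBase; only the map shape matters here.
struct VarBase {};

typedef std::map<std::string, std::vector<VarBase*>> VarBasePtrMap;

int main() {
  VarBase x, y;
  VarBasePtrMap inputs;
  inputs["X"] = {&x};  // one slot name can carry several VarBase* entries
  inputs["Y"] = {&y};
  return static_cast<int>(inputs.size()) - 2;  // 0 on success
}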
...@@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
       i++) {
    LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i]
              << " result: " << result[i];
    EXPECT_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
                1e-3);
  }
}
......
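Editor's note: a minimal gtest sketch (not from this commit) of the EXPECT_NEAR contract the fix above relies on. It asserts |val1 - val2| <= abs_error, which is the right comparison for floating-point model outputs; exact equality is brittle.

#include <gtest/gtest.h>

TEST(FloatCompare, NearTolerance) {
  float computed = 0.33340f;
  float reference = 1.0f / 3.0f;
  // passes: |0.33340 - 0.33333...| is well under the 1e-3 tolerance
  EXPECT_NEAR(computed, reference, 1e-3);
}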
...@@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps}
        ir_graph_build_pass
        ir_analysis_pass
        analysis_passes
        subgraph_detector
        CACHE INTERNAL "")
...@@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
  use_tensorrt_ = true;
  tensorrt_workspace_size_ = workspace_size;
  tensorrt_max_batchsize_ = max_batch_size;
  Update();
}

void contrib::AnalysisConfig::Update() {
......
...@@ -35,8 +35,11 @@ using framework::proto::ProgramDesc;
using framework::NaiveExecutor;
using contrib::AnalysisConfig;

/** \brief This predictor is based on the original native predictor with IR and
 * Analysis support.
 *
 * It will optimize IR and Parameters in the runtime.
 *
 * TODO(Superjomn) Replace the Native predictor?
 */
class AnalysisPredictor : public PaddlePredictor {
......
...@@ -19,7 +19,6 @@ limitations under the License. */
#include <memory>
#include <string>
#include <vector>

#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
......
...@@ -92,10 +92,10 @@ if(WITH_MKL)
  if(NOT WIN32)
    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
                 ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
  else()
    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX}
                 ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX})
  endif()
  set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
  if(EXISTS ${MKLDNN_PATH})
    include_directories("${MKLDNN_PATH}/include")
...@@ -128,8 +128,8 @@ else()
        ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
        ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash
        snappystream ${EXTERNAL_LIB})
  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
  set(DEPS ${DEPS} libcmt ${os_dependency_modules})
endif(NOT WIN32)
if(WITH_GPU) if(WITH_GPU)
......
...@@ -116,6 +116,10 @@ D
        --modeldir=$DATA_DIR/mobilenet/model \
        --data=$DATA_DIR/mobilenet/data.txt \
        --refer=$DATA_DIR/mobilenet/result.txt
    if [ $? -ne 0 ]; then
      echo "trt demo trt_mobilenet_demo runs fail."
      exit 1
    fi
  fi
done
set +x
...@@ -38,8 +38,8 @@ void Main() {
  std::unique_ptr<PaddlePredictor> predictor;
  paddle::contrib::AnalysisConfig config;
  config.EnableUseGpu(100, 0);
  config.SetModel(FLAGS_modeldir + "/__model__",
                  FLAGS_modeldir + "/__params__");
  config.EnableTensorRtEngine();
  predictor = CreatePaddlePredictor(config);
......
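Editor's note: the fix above swaps the argument order. A hedged sketch of the corrected contract: SetModel() takes the program file first and the parameters file second, matching the declaration SetModel(const std::string& prog_file_path, const std::string& params_file_path). The include path is an assumption about the public header name.

#include <string>
#include "paddle_inference_api.h"  // assumed public header

void ConfigureModel(paddle::contrib::AnalysisConfig* config,
                    const std::string& model_dir) {
  config->SetModel(model_dir + "/__model__",    // program (topology) first
                   model_dir + "/__params__");  // composed parameters second
}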
...@@ -204,11 +204,14 @@ static std::string DescribeTensor(const PaddleTensor &tensor) {
    os << to_string(l) << "; ";
  }
  os << "\n";
  os << " - memory length: " << tensor.data.length();
  os << "\n";

  os << " - data: ";
  int dim = VecReduceToInt(tensor.shape);
  float *pdata = static_cast<float *>(tensor.data.data());
  for (int i = 0; i < dim; i++) {
    os << pdata[i] << " ";
  }
  os << '\n';
  return os.str();
...@@ -224,10 +227,12 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
    os << to_string(l) << "; ";
  }
  os << "\n";
  PaddlePlace place;
  int size;
  const auto *data = tensor.data<float>(&place, &size);
  os << " - numel: " << size;
  os << "\n";
  os << " - data: ";
  for (int i = 0; i < size; i++) {
    os << data[i] << " ";
  }
......
...@@ -19,6 +19,8 @@
#include <unordered_set>
#include <vector>

/*! \file */

// Here we include some header files with relative paths, because in
// deployment the absolute path of this header file will be changed.
#include "paddle_api.h"  // NOLINT
...@@ -41,49 +43,125 @@ struct AnalysisConfig {
  explicit AnalysisConfig(const std::string& prog_file,
                          const std::string& params_file);

  /** Set model with a directory.
   */
  void SetModel(const std::string& model_dir) { model_dir_ = model_dir; }

  /** Set model with two specific paths for the program and the parameters.
   */
  void SetModel(const std::string& prog_file_path,
                const std::string& params_file_path);

  /** Set the program file path.
   */
  void SetProgFile(const std::string& x) { prog_file_ = x; }

  /** Set the composed parameters file path.
   */
  void SetParamsFile(const std::string& x) { params_file_ = x; }

  /** Get the model directory path.
   */
  const std::string& model_dir() const { return model_dir_; }

  /** Get the program file path.
   */
  const std::string& prog_file() const { return prog_file_; }

  /** Get the composed parameters file path.
   */
  const std::string& params_file() const { return params_file_; }

  // GPU related.

  /**
   * \brief Turn on GPU.
   * @param memory_pool_init_size_mb initial size of the GPU memory pool in MB.
   * @param device_id the GPU card to use (default is 0).
   */
  void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);

  /** Turn off the GPU.
   */
  void DisableGpu();

  /** A boolean state telling whether the GPU is turned on.
   */
  bool use_gpu() const { return use_gpu_; }

  /** Get the GPU device id.
   */
  int gpu_device_id() const { return device_id_; }

  /** Get the initial size in MB of the GPU memory pool.
   */
  int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; }

  /** Get the proportion of the initial memory pool size compared to the device.
   */
  float fraction_of_gpu_memory_for_pool() const;

  /** \brief Control whether to perform IR graph optimization.
   *
   * If turned off, the AnalysisConfig will act just like a NativeConfig.
   */
  void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; }

  /** A boolean state telling whether the IR graph optimization is activated.
   */
  bool ir_optim() const { return enable_ir_optim_; }

  /** \brief INTERNAL Determine whether to use the feed and fetch operators.
   * Just for internal development, not stable yet.
   * When ZeroCopyTensor is used, this should be turned off.
   */
  void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; }

  /** A boolean state telling whether to use the feed and fetch operators.
   */
  bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; }

  /** \brief Control whether to specify the inputs' names.
   *
   * The PaddleTensor type has a `name` member; assign it the corresponding
   * variable name. This is used only when the input PaddleTensors passed to
   * `PaddlePredictor.Run(...)` cannot follow the order in the training phase.
   */
  void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; }

  /** A boolean state telling whether the specified input PaddleTensor names
   * should be used to reorder the inputs in `PaddlePredictor.Run(...)`.
   */
  bool specify_input_name() const { return specify_input_name_; }

  /**
   * \brief Turn on the TensorRT engine.
   *
   * The TensorRT engine will accelerate some subgraphs in the original Fluid
   * computation graph. In some models such as ResNet50, GoogleNet and so on,
   * it gains significant performance acceleration.
   *
   * @param workspace_size the memory size (in bytes) used for the TensorRT
   * workspace.
   * @param max_batch_size the maximum batch size of this prediction task;
   * better set as small as possible to avoid performance loss.
   * @param min_subgraph_size the minimum TensorRT subgraph size needed; if a
   * subgraph is smaller than this, it will not be transferred to the TensorRT
   * engine.
   */
  void EnableTensorRtEngine(int workspace_size = 1 << 20,
                            int max_batch_size = 1, int min_subgraph_size = 3);

  /** A boolean state telling whether the TensorRT engine is used.
   */
  bool tensorrt_engine_enabled() const { return use_tensorrt_; }

  /** Control whether to debug the IR graph analysis phase.
   */
  void SwitchIrDebug(int x = true) { ir_debug_ = x; }

  /** Turn on MKLDNN.
   */
  void EnableMKLDNN();

  /** A boolean state telling whether MKLDNN is used.
   */
  bool mkldnn_enabled() const { return use_mkldnn_; }

  /** Set and get the number of CPU math library threads.
   */
  void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);

  /** An int state telling how many threads are used in the CPU math library.
   */
  int cpu_math_library_num_threads() const {
    return cpu_math_library_num_threads_;
  }

  /** Transform the AnalysisConfig to a NativeConfig.
   */
  NativeConfig ToNativeConfig() const {
    NativeConfig config;
    config.model_dir = model_dir_;
...@@ -95,19 +173,30 @@ struct AnalysisConfig {
    config.specify_input_name = specify_input_name_;
    return config;
  }

  /** Specify the operator type list to use MKLDNN acceleration.
   * @param op_list the operator type list.
   */
  void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
    mkldnn_enabled_op_types_ = op_list;
  }

  /** Specify the memory buffer of the program and the parameters.
   * @param prog_buffer the memory buffer of the program.
   * @param prog_buffer_size the size of the program data.
   * @param params_buffer the memory buffer of the composed parameters file.
   * @param params_buffer_size the size of the composed parameters data.
   */
  void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
                      const char* params_buffer, size_t params_buffer_size);

  /** A boolean state telling whether the model is set from the CPU memory.
   */
  bool model_from_memory() const { return model_from_memory_; }

  friend class ::paddle::AnalysisPredictor;

  /** NOTE just for developers, not an official API, easily broken.
   * Get a pass builder to customize the passes in the IR analysis phase.
   */
  PassStrategy* pass_builder() const;

 protected:
......
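Editor's note: a sketch (not part of the commit) wiring together the AnalysisConfig knobs documented above. All methods appear in the header; the include path is an assumption about the public header name.

#include <string>
#include "paddle_inference_api.h"  // assumed public header

int main() {
  paddle::contrib::AnalysisConfig config;
  config.SetModel("./model/__model__", "./model/__params__");
  config.EnableUseGpu(100 /* memory_pool_init_size_mb */, 0 /* device_id */);
  config.EnableTensorRtEngine(1 << 20 /* workspace_size */,
                              1 /* max_batch_size */,
                              3 /* min_subgraph_size */);
  config.SwitchIrOptim(true);
  config.SetCpuMathLibraryNumThreads(1);

  // The same config can be downgraded for a native-vs-analysis comparison.
  paddle::NativeConfig native = config.ToNativeConfig();
  (void)native;

  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor ? 0 : 1;
}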
...@@ -13,61 +13,76 @@
// limitations under the License.

#pragma once

/*! \file paddle_api.h
 */

#include <cassert>
#include <memory>
#include <string>
#include <vector>

/*! \namespace paddle
 */
namespace paddle {

/** paddle data type.
 */
enum PaddleDType {
  FLOAT32,
  INT64,
  // TODO(Superjomn) support more data types if needed.
};

/**
 * \brief Memory manager for PaddleTensor.
 *
 * The PaddleBuf holds a buffer for data input or output. The memory can be
 * allocated by the user or by the PaddleBuf itself, but in any case, the
 * PaddleBuf should be reused for better performance.
 *
 * For user-allocated memory, the following APIs can be used:
 * - PaddleBuf(void* data, size_t length) to set an external memory by
 *   specifying the memory address and length.
 * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
 *   memory.
 * ATTENTION, for user-allocated memory, deallocation should be done by the
 * user externally after the program finishes. The PaddleBuf won't do any
 * allocation or deallocation.
 *
 * To have the PaddleBuf allocate and manage the memory:
 * - PaddleBuf(size_t length) will allocate a memory of size `length`.
 * - Resize(size_t length) resizes the memory to no less than `length`.
 *   ATTENTION, if the allocated memory is already larger than `length`,
 *   nothing will be done.
 */
class PaddleBuf {
 public:
  /** PaddleBuf allocates memory internally, and manages it.
   */
  explicit PaddleBuf(size_t length)
      : data_(new char[length]), length_(length), memory_owned_(true) {}

  /** Set external memory; the PaddleBuf won't manage it.
   */
  PaddleBuf(void* data, size_t length)
      : data_(data), length_(length), memory_owned_{false} {}

  /** Copy only available when memory is managed externally.
   */
  explicit PaddleBuf(const PaddleBuf&);

  /** Resize the memory.
   */
  void Resize(size_t length);

  /** Reset to external memory, with address and length set.
   */
  void Reset(void* data, size_t length);

  /** Tell whether the buffer is empty.
   */
  bool empty() const { return length_ == 0; }

  /** Get the memory address.
   */
  void* data() const { return data_; }

  /** Get the memory length.
   */
  size_t length() const { return length_; }

  ~PaddleBuf() { Free(); }
...@@ -83,7 +98,8 @@ class PaddleBuf {
  bool memory_owned_{true};
};

/** Basic input and output data structure for PaddlePredictor.
 */
struct PaddleTensor {
  PaddleTensor() = default;
  std::string name;  // variable name.
...@@ -94,19 +110,23 @@ struct PaddleTensor {
};

enum class PaddlePlace { kUNK = -1, kCPU, kGPU };

/** Tensor without copy, currently only supported by AnalysisPredictor.
 */
class ZeroCopyTensor {
 public:
  void Reshape(const std::vector<int>& shape);

  /** Get the memory in CPU or GPU with the specific data type; Reshape first
   * to tell the data size.
   * One can directly call this to feed the data.
   * This is for writing the input tensor.
   */
  template <typename T>
  T* mutable_data(PaddlePlace place);

  /** Get the memory directly; it will return the place and element size by
   * pointer.
   * This is for reading the output tensor.
   */
  template <typename T>
  T* data(PaddlePlace* place, int* size) const;
...@@ -128,8 +148,7 @@ class ZeroCopyTensor {
  void* scope_{nullptr};
};

/** A simple Inference API for Paddle.
 */
class PaddlePredictor {
 public:
...@@ -138,18 +157,20 @@ class PaddlePredictor {
  PaddlePredictor(const PaddlePredictor&) = delete;
  PaddlePredictor& operator=(const PaddlePredictor&) = delete;

  /** Predict a record.
   * The caller should be responsible for allocating and releasing the memory
   * of `inputs`. `inputs` should be available until Run returns. The caller
   * should be responsible for the output tensor's buffer, either allocated or
   * passed from outside.
   */
  virtual bool Run(const std::vector<PaddleTensor>& inputs,
                   std::vector<PaddleTensor>* output_data,
                   int batch_size = -1) = 0;

  /** Zero-copy input and output optimization.
   * Get the input or output tensors, and operate on their memory directly,
   * without copy.
   */
  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
      const std::string& name) {
    return nullptr;
...@@ -160,16 +181,19 @@ class PaddlePredictor {
  }
  virtual bool ZeroCopyRun() { return false; }

  /** Clone a predictor that shares the model weights; the cloned predictor
   * should be thread-safe.
   */
  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;

  /** Destroy the Predictor.
   */
  virtual ~PaddlePredictor() = default;

  /** The common configs for all the predictors.
   */
  struct Config {
    std::string model_dir; /*!< path to the model directory. */
  };
};
...@@ -177,17 +201,21 @@ struct NativeConfig : public PaddlePredictor::Config {
  // GPU related fields.
  bool use_gpu{false};
  int device{0};
  float fraction_of_gpu_memory{
      -1.f}; /*!< Change to a float in (0,1] if needed. */

  // Specify the exact path of program and parameter files.
  std::string prog_file;
  std::string param_file;

  /** Specify the variable's name of each input if input tensors don't follow
   * the `feeds` and `fetches` of the phase `save_inference_model`.
   */
  bool specify_input_name{false};

  /** Set and get the number of CPU math library threads.
   */
  void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
    cpu_math_library_num_threads_ = cpu_math_library_num_threads;
  }
...@@ -201,28 +229,33 @@ struct NativeConfig : public PaddlePredictor::Config {
  int cpu_math_library_num_threads_{1};
};

/*! \fn std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&
 * config);
 *
 * \brief A factory to help create different predictors.
 *
 * Usage:
 *
 *   NativeConfig config;
 *   ...  // change the configs.
 *   auto native_predictor = CreatePaddlePredictor(config);
 *
 * FOR EXTENSION DEVELOPERS:
 * Different predictors are designated by the config type. Similar configs can
 * be merged, but there shouldn't be a huge config containing different fields
 * for more than one kind of predictor.
 */
template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);

/** NOTE The following APIs are too trivial; we will discard them in future
 * versions.
 */
enum class PaddleEngineKind {
  kNative = 0,        /*!< Use the native Fluid facility. */
  kAutoMixedTensorRT, /*!< Automatically mix Fluid with TensorRT. */
  kAnalysis,          /*!< More optimization. */
  kAnakin             /*!< Use Anakin for inference, not mature yet. */
};

template <typename ConfigT, PaddleEngineKind engine>
......
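Editor's note: a sketch (not part of the commit) of the two PaddleBuf ownership modes described in the comment block above. The include path is an assumption about the public header name.

#include <vector>
#include "paddle_inference_api.h"  // assumed public header; PaddleBuf lives here

int main() {
  // 1) Self-managed: PaddleBuf allocates and frees the memory itself.
  paddle::PaddleBuf owned(1024);  // allocates 1024 bytes
  owned.Resize(4096);             // grows to no less than 4096 bytes

  // 2) External: the caller owns the memory; PaddleBuf only references it.
  std::vector<float> host(256);
  paddle::PaddleBuf external(host.data(), host.size() * sizeof(float));
  external.Reset(host.data(), host.size() * sizeof(float));
  // `host` must stay alive (and be freed by us) as long as `external` is used.

  return (owned.empty() || external.empty()) ? 1 : 0;
}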
...@@ -18,30 +18,39 @@
#include <string>
#include <vector>

/*! \file */

/*! \namespace paddle */
namespace paddle {

/** This is a pass builder based on strings. It is part of the inference API.
 */
class PaddlePassBuilder {
 public:
  explicit PaddlePassBuilder(const std::vector<std::string> &passes)
      : passes_(passes) {}

  /** Append a pass to the end of the passes. */
  void AppendPass(const std::string &pass_type);

  /** Insert a pass at a specific position.
   * @param idx the position to insert.
   * @param pass_type the pass key.
   */
  void InsertPass(size_t idx, const std::string &pass_type);

  /** Delete the `idx`-th pass. */
  void DeletePass(size_t idx);

  /** Delete all the passes that have the type `pass_type`. */
  void DeletePass(const std::string &pass_type);

  /** Visualize the computation graph after each pass by generating a DOT
   * language file; one can draw them with the Graphviz toolkit.
   */
  void TurnOnDebug();

  /** Human-readable information. */
  std::string DebugString();

  const std::vector<std::string> &AllPasses() const { return passes_; }
...@@ -50,16 +59,16 @@ class PaddlePassBuilder {
  std::vector<std::string> passes_;
};

/** Pass strategy to help control the IR passes.
 */
class PassStrategy : public PaddlePassBuilder {
 public:
  explicit PassStrategy(const std::vector<std::string> &passes)
      : PaddlePassBuilder(passes) {}

  /** The MKLDNN control exists in both CPU and GPU mode, because there can
   * still be some CPU kernels running even in GPU mode.
   */
  virtual void EnableMKLDNN() = 0;

  bool use_gpu() const { return use_gpu_; }
...@@ -70,8 +79,7 @@ class PassStrategy : public PaddlePassBuilder {
  bool use_gpu_{false};
};

/** The CPU passes controller, used in AnalysisPredictor with CPU mode.
 */
class CpuPassStrategy : public PassStrategy {
 public:
...@@ -81,6 +89,7 @@ class CpuPassStrategy : public PassStrategy {
    passes_.assign({
        "infer_clean_graph_pass",         //
        "attention_lstm_fuse_pass",       //
        "seqpool_concat_fuse_pass",       //
        "seqconv_eltadd_relu_fuse_pass",  //
        // "embedding_fc_lstm_fuse_pass", //
        "fc_lstm_fuse_pass",              //
...@@ -117,8 +126,7 @@ class CpuPassStrategy : public PassStrategy {
  CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
};

/** The GPU passes strategy, used in AnalysisPredictor with GPU mode.
 */
class GpuPassStrategy : public PassStrategy {
 public:
...@@ -133,6 +141,10 @@ class GpuPassStrategy : public PassStrategy {
        "conv_elementwise_add_fuse_pass",  //
    });

    for (int i = 6; i >= 3; i--) {
      passes_.push_back("transpose_flatten" + std::to_string(i) +
                        "_concat_fuse_pass");
    }

    use_gpu_ = true;
  }
......
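Editor's note: a developer-only sketch (not part of the commit; the config header above marks pass_builder() as unstable) of customizing the IR pass pipeline before creating a predictor. Methods and pass names are taken from the headers above; the include path is assumed.

#include "paddle_inference_api.h"  // assumed public header

void CustomizePasses(paddle::contrib::AnalysisConfig* config) {
  paddle::PassStrategy* builder = config->pass_builder();
  builder->DeletePass("attention_lstm_fuse_pass");      // drop one fuse pass
  builder->InsertPass(1, "seqpool_concat_fuse_pass");   // insert at position 1
  builder->TurnOnDebug();  // dump a DOT graph after every pass
}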
...@@ -39,6 +39,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
                  const framework::Scope& scope, bool test_mode) override {
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
    nvinfer1::ILayer* layer = nullptr;
    framework::OpDesc op_desc(op, nullptr);
    VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
...@@ -98,13 +99,21 @@ class ElementwiseWeightOpConverter : public OpConverter {
                                         0};
    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                         0};
    if (op_type_ == "add") {
      nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
          engine_, Scale, *X, scale_mode, shift_weights.get(),
          scale_weights.get(), power_weights.get());
      layer = scale_layer;
    } else if (op_type_ == "mul") {
      nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
          engine_, Scale, *X, scale_mode, scale_weights.get(),
          shift_weights.get(), power_weights.get());
      layer = scale_layer;
    }

    auto output_name = op_desc.Output("Out")[0];
    layer->setName(
        ("elementwise_" + op_type_ + "(Output: " + output_name + ")").c_str());
    layer->getOutput(0)->setName(output_name.c_str());
    engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
    engine_->SetITensor(output_name, layer->getOutput(0));
...@@ -113,6 +122,9 @@ class ElementwiseWeightOpConverter : public OpConverter {
      engine_->DeclareOutput(output_name);
    }
  }

 protected:
  std::string op_type_;
};
class ElementwiseTensorOpConverter : public OpConverter {
...@@ -188,6 +200,16 @@ const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
    {"max", nvinfer1::ElementWiseOperation::kMAX},
};
class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter {
public:
ElementwiseWeightAddOpConverter() { op_type_ = "add"; }
};
class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter {
public:
ElementwiseWeightMulOpConverter() { op_type_ = "mul"; }
};
class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
 public:
  ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
...@@ -227,7 +249,10 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
}  // namespace inference
}  // namespace paddle

REGISTER_TRT_OP_CONVERTER(elementwise_add_weight,
                          ElementwiseWeightAddOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight,
                          ElementwiseWeightMulOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
                          ElementwiseTensorAddOpConverter);
......
...@@ -100,14 +100,14 @@ set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR})
    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
endif()
inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)

# mobilenet with transpose op
set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
endif()
inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)

# resnet50
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
......
...@@ -283,7 +283,7 @@ TEST(Analyzer_rnn1, multi_thread) {
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                 input_slots_all, &outputs, 2 /* multi_thread */);
}

// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
...@@ -351,10 +351,10 @@ TEST(Analyzer_rnn1, ZeroCopy) {
  ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
  LOG(INFO) << "native output " << DescribeTensor(native_outputs.front());

  int output_size{0};  // this is the number of elements, not the memory size
  auto *zero_copy_data = output_tensor->data<float>(&place, &output_size);
  auto *native_data = static_cast<float *>(native_outputs.front().data.data());
  for (int i = 0; i < output_size; i++) {
    EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3);
  }
}
......
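Editor's note: a sketch (not part of the commit) of the contract the loop fix above relies on: ZeroCopyTensor::data() reports the number of elements, not a byte count, so the loop bound is the returned size itself. The include path is assumed.

#include <memory>
#include <string>
#include "paddle_inference_api.h"  // assumed public header

float ReadOutputSum(paddle::PaddlePredictor* predictor,
                    const std::string& name) {
  std::unique_ptr<paddle::ZeroCopyTensor> output =
      predictor->GetOutputTensor(name);
  paddle::PaddlePlace place;
  int num_elements = 0;  // element count, not bytes
  const float* data = output->data<float>(&place, &num_elements);
  float sum = 0.f;
  for (int i = 0; i < num_elements; ++i) sum += data[i];  // no sizeof(float)
  return sum;
}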
...@@ -121,14 +121,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
  }
}

void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  std::vector<PaddleTensor> input_slots;
...@@ -141,15 +133,22 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  }
}
void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->pass_builder()->TurnOnDebug();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
if (use_mkldnn) {
cfg->EnableMKLDNN();
}
}
void profile(bool use_mkldnn = false) {
  AnalysisConfig cfg;
  SetConfig(&cfg, use_mkldnn);

  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
...@@ -169,16 +168,165 @@ TEST(Analyzer_seq_pool1, compare) {
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}

// Compare Deterministic result
TEST(Analyzer_seq_pool1, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
void analysis_fuse_statis(bool use_zerocopy) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  cfg.SwitchUseFeedFetchOps(!use_zerocopy);
  int num_ops;
  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
  ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
  EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
  LOG(INFO) << "num_ops: " << num_ops;
  EXPECT_EQ(num_ops, 195);
}
// Check the fuse status
TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); }
void PrepareZeroCopyInputs(
const std::unique_ptr<PaddlePredictor> &predictor,
std::vector<std::unique_ptr<ZeroCopyTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
// only feed one batch
const auto &one_batch = data.NextBatch();
inputs->clear();
for (size_t i = 0; i < one_batch.size(); ++i) {
auto &slot = one_batch[i];
auto tensor = predictor->GetInputTensor(slot.name + "_embed");
tensor->Reshape(slot.shape);
tensor->SetLoD({slot.lod});
ZeroCopyTensorAssignData<float>(tensor.get(), slot.data);
inputs->emplace_back(std::move(tensor));
}
}
// diff: similarity_norm.tmp_0, // speed: fc_4.tmp_1
static const char out_var_name[] = "reduce_sum_0.tmp_0";
// return the output values
std::vector<float> zerocopy_profile(int repeat_times) {
AnalysisConfig config;
SetConfig(&config);
config.SwitchUseFeedFetchOps(false);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
PrepareZeroCopyInputs(predictor, &inputs);
auto output_tensor = predictor->GetOutputTensor(out_var_name);
Timer timer;
LOG(INFO) << "Warm up run...";
timer.tic();
predictor->ZeroCopyRun();
PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1);
if (FLAGS_profile) {
paddle::platform::ResetProfiler();
}
LOG(INFO) << "Run " << repeat_times << " times...";
timer.tic();
for (int i = 0; i < repeat_times; i++) {
predictor->ZeroCopyRun();
}
PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times,
1);
LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
PaddlePlace place;
int output_size{0};
auto *pdata = output_tensor->data<float>(&place, &output_size);
std::vector<float> res(output_size);
for (int i = 0; i < output_size; ++i) {
res[i] = pdata[i];
}
return res;
}
TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); }
TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
AnalysisConfig config;
SetConfig(&config);
config.SwitchUseFeedFetchOps(false);
auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
double total_time_of_threads{0};
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
predictors.emplace_back(base_predictor->Clone());
// predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
}
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
threads.emplace_back([config, &total_time_of_threads, &predictors, tid] {
auto &predictor = predictors[tid];
std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
PrepareZeroCopyInputs(predictor, &inputs);
auto output_tensor = predictor->GetOutputTensor(out_var_name);
Timer timer;
double total_time{0};
LOG(INFO) << "Warm up run...";
timer.tic();
predictor->ZeroCopyRun();
PrintTime(FLAGS_batch_size, 1, FLAGS_num_threads, tid, timer.toc(), 1);
if (FLAGS_profile) {
paddle::platform::ResetProfiler();
}
int repeat_times = FLAGS_repeat;
LOG(INFO) << "Run " << repeat_times << " times...";
timer.tic();
for (int i = 0; i < repeat_times; i++) {
predictor->ZeroCopyRun();
}
total_time += timer.toc();
total_time_of_threads += total_time;
LOG(INFO) << "thread time: " << total_time / repeat_times;
});
}
for (auto &t : threads) {
t.join();
}
LOG(INFO) << "average time: "
<< total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
}
TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); }
TEST(Analyzer_seq_pool1, zerocopy_compare_native) {
AnalysisConfig config;
SetConfig(&config);
config.SwitchUseFeedFetchOps(true);
auto predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
std::vector<PaddleTensor> native_outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs));
EXPECT_EQ(native_outputs.size(), 1UL);
auto zerocopy_output = zerocopy_profile(1);
EXPECT_EQ(zerocopy_output.size() * sizeof(float),
native_outputs.front().data.length());
auto *native_data = static_cast<float *>(native_outputs.front().data.data());
for (size_t i = 0; i < zerocopy_output.size(); ++i) {
EXPECT_NEAR(zerocopy_output[i], native_data[i], 1e-3);
}
} }
}  // namespace analysis
......
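Editor's note: a sketch (not part of the commit) of the Clone()-per-thread pattern the multi-threaded test above exercises: clones share model weights, and each thread drives its own predictor. ZeroCopyRun() assumes the config disabled the feed/fetch ops; the include path is assumed.

#include <memory>
#include <thread>
#include <vector>
#include "paddle_inference_api.h"  // assumed public header

void RunParallel(paddle::PaddlePredictor* base, int num_threads) {
  // One clone per thread; Clone() shares the model weights.
  std::vector<std::unique_ptr<paddle::PaddlePredictor>> predictors;
  for (int i = 0; i < num_threads; ++i) predictors.emplace_back(base->Clone());

  std::vector<std::thread> workers;
  for (int i = 0; i < num_threads; ++i) {
    workers.emplace_back([&predictors, i] {
      predictors[i]->ZeroCopyRun();  // inputs assumed fed via ZeroCopyTensor
    });
  }
  for (auto& t : workers) t.join();
}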
...@@ -62,7 +62,7 @@ std::ostream &operator<<(std::ostream &os,
                         const contrib::AnalysisConfig &config) {
  os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
  num_spaces++;
  os << config.ToNativeConfig();
  if (!config.model_from_memory()) {
    os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n";
    os << GenSpaces(num_spaces) << "param_file: " << config.params_file()
......
...@@ -54,11 +54,13 @@ namespace paddle {
namespace inference {

void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
  const auto *analysis_config =
      reinterpret_cast<const contrib::AnalysisConfig *>(config);
  if (use_analysis) {
    LOG(INFO) << *analysis_config;
    return;
  }
  LOG(INFO) << analysis_config->ToNativeConfig();
}

void CompareResult(const std::vector<PaddleTensor> &outputs,
...@@ -96,12 +98,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,

std::unique_ptr<PaddlePredictor> CreateTestPredictor(
    const PaddlePredictor::Config *config, bool use_analysis = true) {
  const auto *analysis_config =
      reinterpret_cast<const contrib::AnalysisConfig *>(config);
  if (use_analysis) {
    return CreatePaddlePredictor<contrib::AnalysisConfig>(*analysis_config);
  }
  auto native_config = analysis_config->ToNativeConfig();
  return CreatePaddlePredictor<NativeConfig>(native_config);
}

size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }

...@@ -310,13 +313,12 @@ void CompareDeterministic(
  int num_times = FLAGS_repeat;
  auto predictor = CreateTestPredictor(config, FLAGS_use_analysis);

  std::vector<PaddleTensor> warmup_outputs, outputs;
  // run num_times to Compare Deterministic Result.
  for (size_t j = 0; j < inputs.size(); j++) {
    // warmup run
    predictor->Run(inputs[j], &warmup_outputs, batch_size);
    for (int i = 0; i < num_times; i++) {
      predictor->Run(inputs[j], &outputs, batch_size);
      CompareResult(outputs, warmup_outputs);
    }
...@@ -328,10 +330,7 @@ void CompareNativeAndAnalysis(
    const std::vector<std::vector<PaddleTensor>> &inputs) {
  PrintConfig(config, true);
  std::vector<PaddleTensor> native_outputs, analysis_outputs;
  TestOneThreadPrediction(config, inputs, &native_outputs, false);
  TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
  CompareResult(analysis_outputs, native_outputs);
}
......
...@@ -99,24 +99,12 @@ void compare(std::string model_dir, bool use_tensorrt) {
    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
  }

  contrib::AnalysisConfig analysis_config;
  SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
                                     use_tensorrt, FLAGS_batch_size);
  CompareNativeAndAnalysis(
      reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config),
      inputs_all);
}
TEST(TensorRT_mobilenet, compare) {
......
...@@ -2,6 +2,3 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
cc_binary(visualizer SRCS visualizer.cc DEPS analysis
          paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
...@@ -137,7 +137,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -137,7 +137,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv algorithm --------------------- // ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t algo; cudnnConvolutionFwdAlgo_t algo;
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
bool half_float = false; bool half_float = false;
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
...@@ -158,6 +157,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -158,6 +157,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
VLOG(5) << "NOT use cudnn_tensor_op_math"; VLOG(5) << "NOT use cudnn_tensor_op_math";
} }
#endif #endif
Tensor cudnn_workspace;
void* cudnn_workspace_ptr = nullptr;
auto x_dims = framework::vectorize(input->dims()); auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims()); auto f_dims = framework::vectorize(filter->dims());
...@@ -180,21 +181,26 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -180,21 +181,26 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
.Var(kCUDNNFwdAlgoCache) .Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(); ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
} }
cudnn_workspace =
ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
framework::make_ddim(
{static_cast<int64_t>(workspace_size_limit)}),
dev_ctx);
cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
algo = algo_cache->GetAlgorithm( algo = algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() { x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count; int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS> std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat; fwd_perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
handle, cudnn_input_desc, input_data, cudnn_filter_desc, handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc, filter_data, cudnn_conv_desc, cudnn_output_desc,
output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace, fwd_perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit)); workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)"; VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) { for (int i = 0; i < returned_algo_count; ++i) {
...@@ -219,17 +225,23 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -219,17 +225,23 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit"); "workspace_size to be allocated exceeds the limit");
// Allocate the workspace on GPU memory if not already allocated
if (!cudnn_workspace_ptr) {
cudnn_workspace =
ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
framework::make_ddim(
{static_cast<int64_t>(workspace_size_in_bytes)}),
dev_ctx);
cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
}
// ------------------- cudnn conv forward --------------------- // ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
auto cudnn_func = [&](void* cudnn_workspace) { CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, cudnn_filter_desc, filter_data + i * group_offset_filter,
cudnn_filter_desc, filter_data + i * group_offset_filter, cudnn_conv_desc, algo, cudnn_workspace_ptr, workspace_size_in_bytes,
cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_output_desc, output_data + i * group_offset_out));
&beta, cudnn_output_desc, output_data + i * group_offset_out));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} }
} }
}; };
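The forward-kernel hunks above replace the `workspace_handle.RunFunc(...)` callback pattern with a temporary tensor that is allocated once (eagerly at `workspace_size_limit` when exhaustive search runs, otherwise lazily at `workspace_size_in_bytes`) and whose raw pointer is reused across the group loop. A minimal self-contained sketch of that allocate-once pattern, with a stub standing in for the cuDNN call:

#include <cstddef>
#include <cstdint>
#include <vector>

// Stub standing in for cudnnConvolutionForward; the real kernel hands
// cudnn_workspace_ptr to cuDNN on every iteration of the group loop.
static void conv_forward(const float* in, float* out, void* ws, size_t bytes) {
  (void)ws;
  (void)bytes;
  *out = *in;  // placeholder work
}

int main() {
  const int groups = 4;
  const size_t workspace_bytes = 1 << 20;
  std::vector<float> in(groups, 1.f), out(groups, 0.f);
  // Allocate the scratch buffer once, before the loop, and reuse the same
  // raw pointer for every group -- the pattern this diff switches to.
  std::vector<int8_t> workspace(workspace_bytes);
  void* ws_ptr = workspace.data();
  for (int i = 0; i < groups; ++i) {
    conv_forward(&in[i], &out[i], ws_ptr, workspace_bytes);
  }
  return 0;
}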
...@@ -297,6 +309,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -297,6 +309,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>( cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()), groups); layout, framework::vectorize2int(filter->dims()), groups);
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Enable Tensor Core for cudnn backward
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
VLOG(5) << "use cudnn_tensor_op_math for backward";
} else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
VLOG(5) << "NOT use cudnn_tensor_op_math for backward";
}
#endif
int input_channels = input->dims()[1]; int input_channels = input->dims()[1];
int input_height, input_width, input_depth; int input_height, input_width, input_depth;
if (input->dims().size() == 5) { if (input->dims().size() == 5) {
...@@ -338,10 +365,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -338,10 +365,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
workspace_size_limit = max_user_size * 1024 * 1024; workspace_size_limit = max_user_size * 1024 * 1024;
} }
Tensor cudnn_workspace;
void* cudnn_workspace_ptr = nullptr;
if ((input_data || filter_data) && exhaustive_search) {
cudnn_workspace =
ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
framework::make_ddim(
{static_cast<int64_t>(workspace_size_limit)}),
dev_ctx);
cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
}
auto x_dims = framework::vectorize(input->dims()); auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims()); auto f_dims = framework::vectorize(filter->dims());
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
if (input_grad) { if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
if (exhaustive_search) { if (exhaustive_search) {
...@@ -359,25 +396,22 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -359,25 +396,22 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
->GetMutable< ->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>(); AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
} }
data_algo = data_algo_cache->GetAlgorithm( data_algo = data_algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() { x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count; int returned_algo_count;
std::array<cudnnConvolutionBwdDataAlgoPerf_t, std::array<cudnnConvolutionBwdDataAlgoPerf_t,
kNUM_CUDNN_BWD_DATA_ALGS> kNUM_CUDNN_BWD_DATA_ALGS>
data_perf_stat; data_perf_stat;
auto cudnn_find_bd_data_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE( CUDNN_ENFORCE(platform::dynload::
platform::dynload:: cudnnFindConvolutionBackwardDataAlgorithmEx(
cudnnFindConvolutionBackwardDataAlgorithmEx( handle, cudnn_filter_desc, filter_data,
handle, cudnn_filter_desc, filter_data, cudnn_output_grad_desc, output_grad_data,
cudnn_output_grad_desc, output_grad_data, cudnn_conv_desc, cudnn_input_desc,
cudnn_conv_desc, cudnn_input_desc, input_grad_data, input_grad_data, kNUM_CUDNN_BWD_DATA_ALGS,
kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, &returned_algo_count, data_perf_stat.data(),
data_perf_stat.data(), cudnn_workspace, cudnn_workspace_ptr, workspace_size_limit));
workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_bd_data_func,
workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)"; VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) { for (int i = 0; i < returned_algo_count; ++i) {
...@@ -428,25 +462,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -428,25 +462,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
->GetMutable< ->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>(); AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
} }
filter_algo = f_algo_cache->GetAlgorithm( filter_algo = f_algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() { x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count; int returned_algo_count;
std::array<cudnnConvolutionBwdFilterAlgoPerf_t, std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
kNUM_CUDNN_BWD_FILTER_ALGS> kNUM_CUDNN_BWD_FILTER_ALGS>
filter_perf_stat; filter_perf_stat;
auto cudnn_find_bd_f_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload:: platform::dynload::
cudnnFindConvolutionBackwardFilterAlgorithmEx( cudnnFindConvolutionBackwardFilterAlgorithmEx(
handle, cudnn_input_desc, input_data, handle, cudnn_input_desc, input_data,
cudnn_output_grad_desc, output_grad_data, cudnn_output_grad_desc, output_grad_data,
cudnn_conv_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_filter_desc, filter_grad_data,
filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS, kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
&returned_algo_count, filter_perf_stat.data(), filter_perf_stat.data(), cudnn_workspace_ptr,
cudnn_workspace, workspace_size_limit)); workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_bd_f_func,
workspace_size_limit);
return filter_perf_stat[0].algo; return filter_perf_stat[0].algo;
}); });
VLOG(3) << "cuDNN backward filter algo " << filter_algo; VLOG(3) << "cuDNN backward filter algo " << filter_algo;
...@@ -467,6 +499,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -467,6 +499,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
} }
// ------------------- cudnn conv workspace ---------------------
if (!cudnn_workspace_ptr) {
cudnn_workspace =
ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
framework::make_ddim(
{static_cast<int64_t>(workspace_size_in_bytes)}),
dev_ctx);
cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
}
// ------------------- cudnn conv backward data --------------------- // ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
if (input_grad) { if (input_grad) {
...@@ -474,15 +516,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -474,15 +516,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset input_grad. // Because beta is zero, it is unnecessary to reset input_grad.
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
auto cudnn_func = [&](void* cudnn_workspace) { CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, cudnn_filter_desc,
handle, &alpha, cudnn_filter_desc, filter_data + i * group_offset_filter, cudnn_output_grad_desc,
filter_data + i * group_offset_filter, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
output_grad_data + i * group_offset_out, cudnn_conv_desc, cudnn_workspace_ptr, workspace_size_in_bytes, &beta,
data_algo, cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, input_grad_data + i * group_offset_in));
cudnn_input_desc, input_grad_data + i * group_offset_in));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} }
} }
// ------------------- cudnn conv backward filter --------------------- // ------------------- cudnn conv backward filter ---------------------
...@@ -490,15 +529,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -490,15 +529,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace()); T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset filter_grad. // Because beta is zero, it is unnecessary to reset filter_grad.
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
auto cudnn_func = [&](void* cudnn_workspace) { CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
handle, &alpha, cudnn_input_desc, cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
input_data + i * group_offset_in, cudnn_output_grad_desc, cudnn_conv_desc, filter_algo, cudnn_workspace_ptr,
output_grad_data + i * group_offset_out, cudnn_conv_desc, workspace_size_in_bytes, &beta, cudnn_filter_desc,
filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta, filter_grad_data + i * group_offset_filter));
cudnn_filter_desc, filter_grad_data + i * group_offset_filter));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} }
} }
} }
......
...@@ -318,10 +318,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -318,10 +318,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations"); std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
bool fuse_relu = ctx.Attr<bool>("fuse_relu"); bool fuse_relu = ctx.Attr<bool>("fuse_relu");
bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
bool force_fp32_output = ctx.Attr<bool>("force_fp32_output"); bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
if (fuse_residual_conn) {
PADDLE_ENFORCE(!force_fp32_output,
"residual fusion does not support forcing fp32 output");
}
bool is_conv3d = strides.size() == 3U; bool is_conv3d = strides.size() == 3U;
// TODO(tpatejko): add support for dilation // TODO(tpatejko): add support for dilation
...@@ -355,14 +359,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -355,14 +359,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
framework::DataTypeTrait<float>::DataType); framework::DataTypeTrait<float>::DataType);
} }
if (fuse_residual_conn) {
auto residual = ctx.Input<Tensor>("ResidualData");
auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type());
if (dst_dt != residual_dt) dst_dt = residual_dt;
}
// Get unique name for storing MKLDNN primitives // Get unique name for storing MKLDNN primitives
std::string key; std::string key;
key.reserve(MaxKeyLength); key.reserve(MaxKeyLength);
platform::ConvMKLDNNHandler::AppendKey( platform::ConvMKLDNNHandler::AppendKey(
&key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt,
input->format(), dst_dt, ctx.op().Output("Output")); input->format(), fuse_relu, fuse_residual_conn,
ctx.op().Output("Output"));
const std::string key_conv_pd = key + "@conv_pd"; const std::string key_conv_pd = key + "@conv_pd";
bool need_s8_to_u8 = false;
std::shared_ptr<mkldnn::convolution_forward> conv_p = nullptr; std::shared_ptr<mkldnn::convolution_forward> conv_p = nullptr;
std::shared_ptr<mkldnn::memory> src_memory_p = nullptr; std::shared_ptr<mkldnn::memory> src_memory_p = nullptr;
std::shared_ptr<mkldnn::memory> user_src_memory_p = nullptr; std::shared_ptr<mkldnn::memory> user_src_memory_p = nullptr;
...@@ -377,14 +390,20 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -377,14 +390,20 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto src_key = key + "@src_mem_p"; auto src_key = key + "@src_mem_p";
auto user_src_key = key + "@user_src_mem_p"; auto user_src_key = key + "@user_src_mem_p";
auto src_reorder_key = key + "@src_mem_preorder_p"; auto src_reorder_key = key + "@src_mem_preorder_p";
auto residual_reorder_key = key + "@residual_data_mem_preorder_p";
conv_p = std::static_pointer_cast<mkldnn::convolution_forward>( conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
dev_ctx.GetBlob(prim_key)); dev_ctx.GetBlob(prim_key));
if (conv_p == nullptr || !is_test) { if (conv_p == nullptr || !is_test) {
const K* filter_data = filter->data<K>(); const K* filter_data = filter->data<K>();
auto scale_in_data = ctx.Attr<float>("Scale_in"); auto scale_in_data = ctx.Attr<float>("Scale_in");
auto scale_in_eltwise_data = ctx.Attr<float>("Scale_in_eltwise");
auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights"); auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
auto scale_out_data = auto scale_out_data =
force_fp32_output ? 1.0f : ctx.Attr<float>("Scale_out"); force_fp32_output ? 1.0f : ctx.Attr<float>("Scale_out");
float sum_scale =
fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f;
bool is_multi_channel = scale_weights_data.size() > 1; bool is_multi_channel = scale_weights_data.size() > 1;
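For the int8 residual fusion, the residual input is quantized at `Scale_in_eltwise` while the convolution output uses `Scale_out`, so the sum post-op must rescale what it accumulates; per the added lines above:

$$ \text{sum\_scale} = \frac{\text{scale\_out}}{\text{scale\_in\_eltwise}} $$

which expresses `sum_scale * residual` in the output's quantization scale before the addition.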
...@@ -427,6 +446,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -427,6 +446,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
weights_tz, memory::data_type::s8, chosen_memory_format); weights_tz, memory::data_type::s8, chosen_memory_format);
auto dst_md = auto dst_md =
platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format);
// create a conv primitive descriptor and save it for usage in backward // create a conv primitive descriptor and save it for usage in backward
if (bias) { if (bias) {
bias_tz = paddle::framework::vectorize2int(bias->dims()); bias_tz = paddle::framework::vectorize2int(bias->dims());
...@@ -434,11 +454,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -434,11 +454,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
memory::format::x); memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine, strides, paddings, mkldnn_engine,
fuse_relu, output_shift_scale, is_test); fuse_relu, fuse_residual_conn,
output_shift_scale, sum_scale, is_test);
} else { } else {
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, conv_pd =
paddings, mkldnn_engine, fuse_relu, ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
output_shift_scale, is_test); mkldnn_engine, fuse_relu, fuse_residual_conn,
output_shift_scale, sum_scale, is_test);
} }
// Save conv_pd/src_memory/weights_memory for backward pass // Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd); dev_ctx.SetBlob(key_conv_pd, conv_pd);
...@@ -463,7 +485,41 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -463,7 +485,41 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
user_weights_memory_p, pipeline, is_test, true, scale_weights_data, user_weights_memory_p, pipeline, is_test, true, scale_weights_data,
mask_reorder); mask_reorder);
if (!force_fp32_output) { if (fuse_residual_conn) {
auto residual_param = ctx.Input<Tensor>("ResidualData");
PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
"Output and elementwise parameter need to have the "
"same dimension sizes");
auto residual_dt =
paddle::framework::ToMKLDNNDataType(residual_param->type());
if (residual_param->format() != handler->GetDstFormat()) {
auto residual_data_tz =
paddle::framework::vectorize2int(residual_param->dims());
auto user_residual_md = platform::MKLDNNMemDesc(
residual_data_tz, residual_dt, residual_param->format());
if (residual_dt == mkldnn::memory::data_type::u8) {
dst_memory_p = platform::SetDstMemory<uint8_t>(
ctx, output, residual_param, user_residual_md, handler,
&pipeline);
} else {
need_s8_to_u8 = fuse_relu;
dst_memory_p = platform::SetDstMemory<int8_t>(
ctx, output, residual_param, user_residual_md, handler,
&pipeline);
}
} else {
output->ShareDataWith(*residual_param);
if (residual_dt == mkldnn::memory::data_type::u8) {
dst_memory_p =
platform::SetDstMemory<uint8_t>(ctx, output, handler);
} else {
need_s8_to_u8 = fuse_relu;
dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
}
}
} else if (!force_fp32_output) {
if (fuse_relu) { if (fuse_relu) {
dst_memory_p = platform::SetDstMemory<uint8_t>(ctx, output, handler); dst_memory_p = platform::SetDstMemory<uint8_t>(ctx, output, handler);
} else { } else {
...@@ -476,11 +532,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -476,11 +532,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// create convolution op primitive // create convolution op primitive
auto scale_bias_key = key + "@scale_bias"; auto scale_bias_key = key + "@scale_bias";
if (bias) { if (bias) {
const float* bias_data = bias->data<float>(); const K* bias_data = bias->data<K>();
auto user_bias_md = platform::MKLDNNMemDesc( auto user_bias_md = platform::MKLDNNMemDesc(
{bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x); {bias_tz}, platform::MKLDNNGetDataType<K>(), memory::format::x);
auto user_bias_memory_p = handler->AcquireBiasMemory( auto user_bias_memory_p = handler->AcquireBiasMemory(
user_bias_md, to_void_cast<float>(bias_data)); user_bias_md, to_void_cast<K>(bias_data));
std::shared_ptr<mkldnn::memory> bias_memory_p; std::shared_ptr<mkldnn::memory> bias_memory_p;
int mask_reorder = is_multi_channel ? 1 << 0 : 1; int mask_reorder = is_multi_channel ? 1 << 0 : 1;
int count = int count =
...@@ -526,26 +582,51 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -526,26 +582,51 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx,
mkldnn_engine, key)); mkldnn_engine, key));
} }
if (!force_fp32_output) {
if (fuse_residual_conn) {
auto residual_param = ctx.Input<Tensor>("ResidualData");
auto residual_dt =
paddle::framework::ToMKLDNNDataType(residual_param->type());
output->ShareDataWith(*residual_param);
if (residual_dt == mkldnn::memory::data_type::u8) {
platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
&dst_memory_p);
} else {
platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
&dst_memory_p);
}
} else if (!force_fp32_output) {
if (fuse_relu) { if (fuse_relu) {
dst_memory_p = platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler); &dst_memory_p);
} else { } else {
dst_memory_p = platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
platform::SetDstMemoryHandler<int8_t>(ctx, output, handler); &dst_memory_p);
} }
} else { } else {
dst_memory_p = platform::SetDstMemoryHandler<float>(ctx, output, handler,
platform::SetDstMemoryHandler<float>(ctx, output, handler); &dst_memory_p);
} }
if (src_memory_reorder_p) { if (src_memory_reorder_p) {
pipeline.push_back(*src_memory_reorder_p); pipeline.push_back(*src_memory_reorder_p);
} }
auto residual_reorder_p = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(residual_reorder_key));
if (residual_reorder_p) {
pipeline.push_back(*residual_reorder_p);
}
pipeline.push_back(*conv_p); pipeline.push_back(*conv_p);
} }
// push primitive to stream and wait until it's executed // push primitive to stream and wait until it's executed
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
if (need_s8_to_u8) {
output->mutable_data<uint8_t>(ctx.GetPlace());
}
output->set_layout(DataLayout::kMKLDNN); output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p)); output->set_format(GetMKLDNNFormat(*dst_memory_p));
} }
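A note on `need_s8_to_u8`: when the residual data is s8 and ReLU is fused, the destination memory is created as s8, and only after the pipeline executes is the output buffer re-typed via `output->mutable_data<uint8_t>(...)`. The reading here is that the fused ReLU guarantees non-negative results, so the s8 storage can safely be relabeled u8 for downstream consumers.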
...@@ -577,11 +658,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -577,11 +658,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
} }
mkldnn::primitive_attr CreatePostOps( mkldnn::primitive_attr CreatePostOps(
bool fuse_relu, const std::vector<float> output_shift_scale) const { bool fuse_relu, bool fuse_residual_conn,
const std::vector<float> output_shift_scale, float sum_scale) const {
mkldnn::primitive_attr conv_attr; mkldnn::primitive_attr conv_attr;
mkldnn::post_ops post_operations; mkldnn::post_ops post_operations;
int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
conv_attr.set_output_scales(mask, output_shift_scale); conv_attr.set_output_scales(mask, output_shift_scale);
if (fuse_residual_conn) {
post_operations.append_sum(sum_scale);
}
if (fuse_relu) { if (fuse_relu) {
constexpr float scale = 1.0f; constexpr float scale = 1.0f;
constexpr float negative_slope = 0.0f; constexpr float negative_slope = 0.0f;
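Only the head of `CreatePostOps` is visible above; presumably the elided tail appends the eltwise op and attaches the post-ops, following the standard MKL-DNN 0.x sequence. A hedged sketch of the full composition (the `MakeConvAttr` name is illustrative, not from the source):

#include <vector>
#include "mkldnn.hpp"

// Sketch, not the verbatim tail of CreatePostOps:
// dst = relu(output_shift_scale * conv(src, w) + sum_scale * dst_residual)
mkldnn::primitive_attr MakeConvAttr(bool fuse_relu, bool fuse_residual_conn,
                                    const std::vector<float>& output_shift_scale,
                                    float sum_scale) {
  mkldnn::primitive_attr conv_attr;
  mkldnn::post_ops post_operations;
  int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
  conv_attr.set_output_scales(mask, output_shift_scale);
  if (fuse_residual_conn) {
    post_operations.append_sum(sum_scale);  // accumulate into the existing dst
  }
  if (fuse_relu) {
    post_operations.append_eltwise(1.0f, mkldnn::algorithm::eltwise_relu,
                                   0.0f /* negative_slope */, 0.0f);
  }
  conv_attr.set_post_ops(post_operations);
  return conv_attr;
}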
...@@ -622,8 +707,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -622,8 +707,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const memory::desc& dst, const std::vector<int>& strides, const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn,
const std::vector<float> output_shift_scale, const std::vector<float> output_shift_scale,
bool is_test) const { const float sum_scale, bool is_test) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
...@@ -634,8 +720,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -634,8 +720,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims, propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims,
padding_dims, padding_dims, mkldnn::padding_kind::zero); padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = mkldnn::primitive_attr conv_attr = CreatePostOps(
CreatePostOps(fuse_relu, output_shift_scale); fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine); conv_desc, conv_attr, engine);
...@@ -675,8 +761,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -675,8 +761,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn,
const std::vector<float> output_shift_scale, const std::vector<float> output_shift_scale,
bool is_test) const { const float sum_scale, bool is_test) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
...@@ -687,8 +774,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -687,8 +774,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
propagation, mkldnn::convolution_direct, src, weights, bias, dst, propagation, mkldnn::convolution_direct, src, weights, bias, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = mkldnn::primitive_attr conv_attr = CreatePostOps(
CreatePostOps(fuse_relu, output_shift_scale); fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine); conv_desc, conv_attr, engine);
...@@ -891,7 +978,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -891,7 +978,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
} }
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
} // Compute() }
}; };
} // namespace operators } // namespace operators
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/data_norm_op.h"
#include <string>
#include "paddle/fluid/framework/data_layout.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
class DataNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSize"), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSum"), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), "");
PADDLE_ENFORCE(ctx->HasOutput("Means"), "");
PADDLE_ENFORCE(ctx->HasOutput("Scales"), "");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "");
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"Input X must have 2 to 5 dimensions.");
const int64_t C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C);
ctx->SetOutputDim("Y", x_dims);
ctx->SetOutputDim("Means", {C});
ctx->SetOutputDim("Scales", {C});
ctx->ShareLoD("X", "Y");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type = ctx.Input<Tensor>("X")->type();
// By default, the type of the BatchSize, BatchSum and
// BatchSquareSum tensors should be float (for float or float16
// input tensors) or double (for double input tensors).
auto dn_param_type = framework::proto::VarType::FP32;
if (input_data_type == framework::proto::VarType::FP64) {
dn_param_type = framework::proto::VarType::FP64;
}
PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input<Tensor>("BatchSize")->type(),
"BatchSize input should be of float type");
PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input<Tensor>("BatchSum")->type(),
"BatchSum input should be of float type");
PADDLE_ENFORCE_EQ(dn_param_type,
ctx.Input<Tensor>("BatchSquareSum")->type(),
"BatchSquareSum input should be of float type");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library);
}
};
class DataNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
// AddAttr<bool>("is_test", "").SetDefault(false);
AddAttr<float>("epsilon", "")
.SetDefault(1e-4)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
"'epsilon' should be between 0.0 and 0.001.");
});
AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
AddInput("X", "The input tensor");
AddInput("BatchSize",
"BatchSize is a 1-dimensional tensor of size C "
"that is applied to the output");
AddInput("BatchSum",
"BatchSum is a 1-dimensional tensor of size C "
"that is applied to the output");
AddInput("BatchSquareSum",
"The global BatchSquareSum (for training) or "
"estimated BatchSquareSum (for testing)");
AddOutput("Y", "result after normalization");
AddOutput("Means",
"Mean of the history data batch, "
"will apply to output when training")
.AsIntermediate();
AddOutput("Scales",
"Scales of the history data batch, "
"will apply to output when training")
.AsIntermediate();
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Data Normalization.
Can be used as a normalizer function for data.
The required data format for this layer is one of the following:
1. NHWC `[batch, in_height, in_width, in_channels]`
2. NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
}
};
template <typename T>
class DataNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
// const bool is_test = ctx.Attr<bool>("is_test");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2");
const int N = x_dims[0];
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
auto *y = ctx.Output<Tensor>("Y");
auto *mean_out = ctx.Output<Tensor>("Means");
auto *scales = ctx.Output<Tensor>("Scales");
// alloc memory
y->mutable_data<T>(ctx.GetPlace());
Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
ConstEigenVectorArrayMap<T> b_size_arr(
ctx.Input<Tensor>("BatchSize")->data<T>(), C);
ConstEigenVectorArrayMap<T> b_sum_arr(
ctx.Input<Tensor>("BatchSum")->data<T>(), C);
ConstEigenVectorArrayMap<T> b_square_sum_arr(
ctx.Input<Tensor>("BatchSquareSum")->data<T>(), C);
EigenVectorArrayMap<T> means_arr(mean_out->mutable_data<T>(ctx.GetPlace()),
C);
EigenVectorArrayMap<T> scales_arr(scales->mutable_data<T>(ctx.GetPlace()),
C);
means_arr = b_sum_arr / b_size_arr;
scales_arr = (b_size_arr / b_square_sum_arr).sqrt();
switch (data_layout) {
case DataLayout::kNCHW: // the input is two-dimensional, so NCHW and
// NHWC make no difference
case DataLayout::kNHWC: {
EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C, N) =
(ConstEigenArrayMap<T>(x->data<T>(), C, N).colwise() - means_arr)
.colwise() *
scales_arr;
break;
}
default:
PADDLE_THROW("Unknown storage order: %d", data_layout);
}
}
};
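In formulas, the kernel above computes per channel $c$ (with $N$ rows of input $x$):

$$ \mu_c = \frac{\text{BatchSum}_c}{\text{BatchSize}_c}, \qquad s_c = \sqrt{\frac{\text{BatchSize}_c}{\text{BatchSquareSum}_c}}, \qquad y_{nc} = (x_{nc} - \mu_c)\, s_c $$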
class DataNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
// check input
PADDLE_ENFORCE(ctx->HasInput("X"));
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSize"), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSum"), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), "");
PADDLE_ENFORCE(ctx->HasInput("Means"), "");
PADDLE_ENFORCE(ctx->HasInput("Scales"), "");
// check output
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSize")), "");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSum")), "");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSquareSum")),
"");
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->SetOutputDim(framework::GradVarName("BatchSize"), {C});
ctx->SetOutputDim(framework::GradVarName("BatchSum"), {C});
ctx->SetOutputDim(framework::GradVarName("BatchSquareSum"), {C});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const auto *var = ctx.InputVar(framework::GradVarName("Y"));
if (var == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
const Tensor *t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
}
if (t == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace(), layout, library);
}
};
template <typename T>
class DataNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *batch_size = ctx.Input<Tensor>("BatchSize");
const auto *batch_sum = ctx.Input<Tensor>("BatchSum");
const auto *batch_square_sum = ctx.Input<Tensor>("BatchSquareSum");
const auto *scales = ctx.Input<Tensor>("Scales");
const auto *means = ctx.Input<Tensor>("Means");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2");
const int N = x_dims[0];
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_batch_size =
ctx.Output<Tensor>(framework::GradVarName("BatchSize"));
auto *d_batch_sum = ctx.Output<Tensor>(framework::GradVarName("BatchSum"));
auto *d_batch_square_sum =
ctx.Output<Tensor>(framework::GradVarName("BatchSquareSum"));
EigenVectorArrayMap<T> d_batch_size_arr(
d_batch_size->mutable_data<T>(ctx.GetPlace()), C);
EigenVectorArrayMap<T> d_batch_sum_arr(
d_batch_sum->mutable_data<T>(ctx.GetPlace()), C);
EigenVectorArrayMap<T> d_batch_square_sum_arr(
d_batch_square_sum->mutable_data<T>(ctx.GetPlace()), C);
d_batch_size_arr.setZero();
d_batch_sum_arr.setZero();
d_batch_square_sum_arr.setZero();
const float epsilon = ctx.Attr<float>("epsilon");
switch (data_layout) {
// the input is two-dimensional, so NCHW and NHWC make no difference
case DataLayout::kNCHW:
case DataLayout::kNHWC: {
ConstEigenVectorArrayMap<T> scales_arr(scales->data<T>(), C);
ConstEigenVectorArrayMap<T> means_arr(means->data<T>(), C);
ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N);
EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), C, N);
d_x_arr.setZero();
for (int nc = 0; nc < N; ++nc) {
d_x_arr.col(nc) = d_y_arr.col(nc) * scales_arr;
}
// calculate data sum and square sum
ConstEigenVectorArrayMap<T> batch_size_arr(batch_size->data<T>(), C);
ConstEigenVectorArrayMap<T> batch_sum_arr(batch_sum->data<T>(), C);
ConstEigenVectorArrayMap<T> batch_square_sum_arr(
batch_square_sum->data<T>(), C);
Eigen::Array<T, Eigen::Dynamic, 1> sample_sum(C);
Eigen::Array<T, Eigen::Dynamic, 1> sample_square_sum(C);
// calculate data sample sum and square sum
sample_sum.setZero();
sample_square_sum.setZero();
for (int nc = 0; nc < N; ++nc) {
sample_sum += x_arr.col(nc);
sample_square_sum += (x_arr.col(nc) - means_arr).square();
}
// calculate gradient
d_batch_size_arr.setConstant(N);
d_batch_sum_arr = sample_sum;
d_batch_square_sum_arr = sample_square_sum + d_batch_size_arr * epsilon;
break;
}
default:
PADDLE_THROW("Unknown storage order: %s", data_layout_str);
}
}
};
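The backward pass above reduces to

$$ \frac{\partial L}{\partial x_{nc}} = \frac{\partial L}{\partial y_{nc}}\, s_c, \qquad d\,\text{BatchSize}_c = N, \qquad d\,\text{BatchSum}_c = \sum_{n} x_{nc}, \qquad d\,\text{BatchSquareSum}_c = \sum_{n} \left(x_{nc} - \mu_c\right)^2 + N\epsilon $$

where the $N\epsilon$ term corresponds to the `d_batch_size_arr * epsilon` line.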
class DataNormGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *op = new framework::OpDesc();
op->SetType("data_norm_grad");
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetInput("BatchSize", Input("BatchSize"));
op->SetInput("BatchSum", Input("BatchSum"));
op->SetInput("BatchSquareSum", Input("BatchSquareSum"));
op->SetInput("Scales", Output("Scales"));
op->SetInput("Means", Output("Means"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("BatchSize"), InputGrad("BatchSize"));
op->SetOutput(framework::GradVarName("BatchSum"), InputGrad("BatchSum"));
op->SetOutput(framework::GradVarName("BatchSquareSum"),
InputGrad("BatchSquareSum"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(data_norm, ops::DataNormOp, ops::DataNormOpMaker,
ops::DataNormGradMaker);
REGISTER_OPERATOR(data_norm_grad, ops::DataNormGradOp);
REGISTER_OP_CPU_KERNEL(
data_norm, ops::DataNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::DataNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
data_norm_grad,
ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class DataNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
template <typename DeviceContext, typename T>
class DataNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
} // namespace operators
} // namespace paddle
...@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_sub, elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_sub_grad, elementwise_sub_grad,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_seqpool_concat_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h"
namespace paddle {
namespace operators {
void FusionSeqPoolConcatOp::InferShape(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
"Inputs(X) of FusionSeqPoolConcatOp should not be empty.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FusionSeqPoolConcatOp should not be null.");
int axis = ctx->Attrs().Get<int>("axis");
PADDLE_ENFORCE_EQ(axis, 1,
"FusionSeqPoolConcatOp only supports concat axis=1 for now.");
auto ins_dims = ctx->GetInputsDim("X");
const size_t n = ins_dims.size();
PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should be greater than 0.");
if (n == 1) {
LOG(WARNING) << "Only have one input, may waste memory";
}
// The output height should be confirmed in Compute,
// since input lod is not accessible here.
PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2UL,
"The dims size of the first input should be 2.");
ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
}
framework::OpKernelType FusionSeqPoolConcatOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
return framework::OpKernelType(
framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]), ctx.GetPlace());
}
void FusionSeqPoolConcatOpMaker::Make() {
AddInput("X", "(LoDTensor) Input tensors of this operator.").AsDuplicable();
AddOutput("Out", "(LoDTensor) Output tensor of concat operator.");
AddAttr<std::string>("pooltype",
"(string, default 'SUM') the pooling type of "
"SequencePoolOp, one of AVERAGE, SUM and SQRT.")
.SetDefault("SUM")
.InEnum({"AVERAGE", "SUM", "SQRT"});
AddAttr<int>("axis",
"The axis along which the input tensors will be concatenated. "
"Only axis=1 is supported for now.")
.SetDefault(1);
AddComment(R"DOC(
Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator.
)DOC");
}
template <typename T>
class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<LoDTensor>("X");
auto* out = ctx.Output<LoDTensor>("Out");
std::string pooltype = ctx.Attr<std::string>("pooltype");
auto x0_lod = ins[0]->lod();
auto x0_dims = ins[0]->dims();
auto y_dims = out->dims();
size_t bs = x0_lod[0].size() - 1;
out->Resize({static_cast<int64_t>(bs), y_dims[1]});
framework::LoD y_lod(1);
y_lod[0].resize(bs + 1);
for (size_t i = 0; i <= bs; ++i) {
y_lod[0][i] = i;
}
out->set_lod(y_lod);
auto place = ctx.GetPlace();
T* y_data = out->mutable_data<T>(place);
int w = ins[0]->numel() / x0_dims[0];
PADDLE_ENFORCE_EQ(y_dims[1] % w, 0,
"The output dims[1] should be divisible by w");
jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum);
if (pooltype == "AVERAGE") {
attr.type = jit::SeqPoolType::kAvg;
} else if (pooltype == "SQRT") {
attr.type = jit::SeqPoolType::kSqrt;
}
auto seqpool =
jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
attr);
size_t n = ins.size();
size_t dst_step_size = n * w;
for (size_t i = 0; i < n; ++i) {
auto x_dims = ins[i]->dims();
auto x_lod = ins[i]->lod()[0];
const T* src = ins[i]->data<T>();
T* dst = y_data + i * w;
PADDLE_ENFORCE_EQ(static_cast<int>(ins[i]->numel() / x_dims[0]), w,
"Width of all inputs should be equal.");
PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1,
"Batch size of all inputs should be equal.");
for (size_t j = 0; j < bs; ++j) {
attr.h = static_cast<int>(x_lod[j + 1] - x_lod[j]);
seqpool(src, dst, &attr);
dst += dst_step_size;
src += attr.h * attr.w;
}
}
}
};
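The kernel writes each input's pooled row into an interleaved column block of the output: input $i$ starts at offset `i * w` and advances by `dst_step_size = n * w` per sequence. A self-contained toy reproduction of that write pattern on plain arrays:

#include <cstdio>
#include <vector>

int main() {
  const int n = 3, w = 2, bs = 2;  // 3 inputs, width 2, batch size 2
  std::vector<float> y(bs * n * w, 0.f);
  for (int i = 0; i < n; ++i) {
    float* dst = y.data() + i * w;  // column block owned by input i
    for (int j = 0; j < bs; ++j) {
      // stand-in for seqpool(src, dst, &attr): fill a recognizable value
      for (int k = 0; k < w; ++k) dst[k] = 10.f * i + j;
      dst += n * w;  // dst_step_size: jump to the same block in the next row
    }
  }
  // row j prints as [in0 | in1 | in2] pooled values for sequence j
  for (int j = 0; j < bs; ++j) {
    for (int k = 0; k < n * w; ++k) std::printf("%5.0f", y[j * n * w + k]);
    std::printf("\n");
  }
  return 0;
}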
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_seqpool_concat, ops::FusionSeqPoolConcatOp,
ops::FusionSeqPoolConcatOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(fusion_seqpool_concat,
ops::FusionSeqPoolConcatKernel<float>,
ops::FusionSeqPoolConcatKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
class FusionSeqPoolConcatOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FusionSeqPoolConcatOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle
...@@ -52,11 +52,11 @@ struct BenchFunc { ...@@ -52,11 +52,11 @@ struct BenchFunc {
for (int i = 0; i < FLAGS_burning; ++i) { for (int i = 0; i < FLAGS_burning; ++i) {
tgt(args...); tgt(args...);
} }
auto start = paddle::platform::PosixInNsec() / 1e-3; auto start = paddle::platform::PosixInNsec() * 1e-3;
for (int i = 0; i < FLAGS_repeat; ++i) { for (int i = 0; i < FLAGS_repeat; ++i) {
tgt(args...); tgt(args...);
} }
auto end = paddle::platform::PosixInNsec() / 1e-3; auto end = paddle::platform::PosixInNsec() * 1e-3;
return static_cast<double>(end - start) / FLAGS_repeat; return static_cast<double>(end - start) / FLAGS_repeat;
} }
}; };
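The change on the two timing lines above is a unit fix: `PosixInNsec()` returns nanoseconds, and multiplying by `1e-3` converts to microseconds, so `BenchFunc` reports the mean microseconds per repeat; the old division by `1e-3` scaled the other way and inflated the reported numbers by a factor of $10^6$.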
...@@ -190,6 +190,26 @@ void BenchGRUKernel() { ...@@ -190,6 +190,26 @@ void BenchGRUKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchSeqPoolKernel() {
std::vector<jit::SeqPoolType> pool_types = {
jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
for (auto type : pool_types) {
for (int w : TestSizes()) {
jit::seq_pool_attr_t attr(w, type);
for (int h : TestSizes()) {
attr.h = h;
std::vector<T> x(h * w), y(w);
RandomVec<T>(h * w, x.data(), -2.f, 2.f);
const T* x_data = x.data();
T* y_data = y.data();
BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
y_data, &attr);
}
}
}
}
// Benchmark all jit kernels including jitcode, mkl and refer. // Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...] // To use this tool, run command: ./benchmark [options...]
// Options: // Options:
...@@ -228,4 +248,7 @@ int main(int argc, char* argv[]) { ...@@ -228,4 +248,7 @@ int main(int argc, char* argv[]) {
BenchGRUKernel<jit::kGRUH1, T, PlaceType>(); BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>(); BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>(); BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
// seq pool function
BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>();
} }
...@@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1) ...@@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1)
USE_JITKERNEL_GEN(kGRUHtPart1) USE_JITKERNEL_GEN(kGRUHtPart1)
USE_JITKERNEL_GEN(kGRUHtPart2) USE_JITKERNEL_GEN(kGRUHtPart2)
USE_JITKERNEL_GEN(kNCHW16CMulNC) USE_JITKERNEL_GEN(kNCHW16CMulNC)
USE_JITKERNEL_GEN(kSeqPool)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/seqpool.h"
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
void SeqPoolJitCode::genCode() {
constexpr int block = YMM_FLOAT_BLOCK;
constexpr int max_num_regs = 8;
const int num_block = w_ / block;
const int num_groups = num_block / max_num_regs;
int rest_num_regs = num_block % max_num_regs;
mov(reg32_int_h, dword[param_attr]);
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
mov(reg_tmp, reinterpret_cast<size_t>(exp_float_consts));
vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]);
mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
fild(dword[param_attr]);
fstp(dword[reg_tmp]);
vmovss(xmm_t(0), ptr[reg_tmp]);
if (type_ == SeqPoolType::kSqrt) {
vsqrtps(xmm_t(0), xmm_t(0));
}
vdivps(xmm_t(1), xmm_t(1), xmm_t(0));
vmovss(ptr[reg_tmp], xmm_t(1));
}
const int group_len = max_num_regs * block * sizeof(float);
for (int g = 0; g < num_groups; ++g) {
pool_height<ymm_t>(g * group_len, block, max_num_regs);
}
if (rest_num_regs > 0) {
pool_height<ymm_t>(num_groups * group_len, block, rest_num_regs);
}
// pool the remaining (w_ % block) columns across the height
const int rest = w_ % block;
pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs);
ret();
}
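The `fild`/`fstp` pair above converts the integer height $h$ to float; with `xmm1` preloaded with $1.0$, the block leaves in `fp_h_` the scaling constant that `pool_height` later broadcasts and multiplies into every summed register:

$$ c = \begin{cases} 1/h & \text{kAvg} \\ 1/\sqrt{h} & \text{kSqrt} \end{cases}, \qquad y_j = c \sum_{t=1}^{h} x_{tj} $$

(for kSum the scaling is skipped, i.e. $c = 1$).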
class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
public:
bool UseMe(const seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx);
}
size_t CodeSize(const seq_pool_attr_t& attr) const override {
return 96 +
((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
4 /* load, mul and save */ +
256) *
8;
}
std::unique_ptr<GenBase> CreateJitCode(
const seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.w, 0);
PADDLE_ENFORCE_GT(attr.h, 0);
return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
}
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
class SeqPoolJitCode : public JitCode {
public:
explicit SeqPoolJitCode(const seq_pool_attr_t& attr,
size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
type_ == SeqPoolType::kSqrt)) {
LOG(FATAL) << "Only support sum pool yet ";
}
fp_h_[0] = 1.f;
this->genCode();
}
virtual const char* name() const {
// Build the name once and cache it in a member: returning c_str() of a
// function-local std::string would hand callers a dangling pointer.
if (name_.empty()) {
name_ = "SeqPoolJitCode";
if (type_ == SeqPoolType::kSum) {
name_ += "_Sum";
} else if (type_ == SeqPoolType::kAvg) {
name_ += "_Avg";
} else if (type_ == SeqPoolType::kSqrt) {
name_ += "_Sqrt";
}
name_ += ("_W" + std::to_string(w_));
}
return name_.c_str();
}
void genCode() override;
protected:
template <typename JMM>
void pool_height(int w_offset, int block, int max_num_regs) {
int offset = w_offset;
for (int i = 0; i < max_num_regs; ++i) {
vmovups(JMM(i), ptr[param_src + offset]);
offset += sizeof(float) * block;
}
cmp(reg32_int_h, 1);
Label l_next_h, l_h_done;
jle(l_h_done, T_NEAR);
mov(reg_h_i, 1);
mov(reg_tmp, param_src);
add(reg_tmp, w_ * sizeof(float) + w_offset);
L(l_next_h);
{
mov(reg_ptr_src_i, reg_tmp);
for (int i = 0; i < max_num_regs; ++i) {
vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]);
// sum anyway
vaddps(JMM(i), JMM(i), JMM(i + max_num_regs));
add(reg_ptr_src_i, sizeof(float) * block);
}
inc(reg_h_i);
add(reg_tmp, w_ * sizeof(float));
cmp(reg_h_i, reg32_int_h);
jl(l_next_h, T_NEAR);
}
L(l_h_done);
// save right now
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]);
}
offset = w_offset;
for (int i = 0; i < max_num_regs; ++i) {
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
vmulps(JMM(i), JMM(i), JMM(max_num_regs));
}
vmovups(ptr[param_dst + offset], JMM(i));
offset += sizeof(float) * block;
}
}
void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) {
const int rest_used_num_regs = load_rest(rest, w_offset, 0);
const bool has_block4 = rest / 4 > 0;
const bool has_block2 = (rest % 4) / 2 > 0;
const bool has_block1 = (rest % 2) == 1;
cmp(reg32_int_h, 1);
Label l_next_h, l_h_done;
jle(l_h_done, T_NEAR);
mov(reg_h_i, 1);
mov(reg_tmp, param_src);
add(reg_tmp, w_ * sizeof(float) + w_offset);
L(l_next_h);
{
int reg_idx = 0;
mov(reg_ptr_src_i, reg_tmp);
if (has_block4) {
vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
add(reg_ptr_src_i, sizeof(float) * 4);
reg_idx++;
}
if (has_block2) {
vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
add(reg_ptr_src_i, sizeof(float) * 2);
reg_idx++;
}
if (has_block1) {
vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
reg_idx++;
}
PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs,
"All heights should use the same number of registers");
for (int i = 0; i < reg_idx; ++i) {
vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
}
inc(reg_h_i);
add(reg_tmp, w_ * sizeof(float));
cmp(reg_h_i, reg32_int_h);
jl(l_next_h, T_NEAR);
}
L(l_h_done);
// save right now
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]);
for (int i = 0; i < rest_used_num_regs; ++i) {
vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs));
}
}
save_rest(rest, w_offset);
}
// Returns the number of regs used; loading starts from reg 0 by default.
int load_rest(int rest, int w_offset, const int num_shift_regs,
const int reg_start = 0) {
const bool has_block4 = rest / 4 > 0;
const bool has_block2 = (rest % 4) / 2 > 0;
const bool has_block1 = (rest % 2) == 1;
int reg_idx = reg_start;
if (has_block4) {
vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
w_offset += sizeof(float) * 4;
reg_idx++;
}
if (has_block2) {
vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
w_offset += sizeof(float) * 2;
reg_idx++;
}
if (has_block1) {
vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
reg_idx++;
}
return reg_idx;
}
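// Illustrative decomposition (hypothetical widths, not from the patch):
// since rest = w_ % YMM_FLOAT_BLOCK is at most 7 floats, it always splits
// into at most one 4-float, one 2-float and one 1-float load, e.g.
//   rest = 7 -> 4 + 2 + 1 (three xmm regs), rest = 5 -> 4 + 1 (two regs),
//   rest = 2 -> 2 (one reg),
// matching the has_block4 / has_block2 / has_block1 flags above.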
// Stores results; register usage starts from reg 0 by default.
void save_rest(int rest, int w_offset, int reg_start = 0) {
const bool has_block4 = rest / 4 > 0;
const bool has_block2 = (rest % 4) / 2 > 0;
const bool has_block1 = (rest % 2) == 1;
int reg_idx = reg_start;
if (has_block4) {
vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx));
w_offset += sizeof(float) * 4;
reg_idx++;
}
if (has_block2) {
vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx));
w_offset += sizeof(float) * 2;
reg_idx++;
}
if (has_block1) {
vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx));
}
}
private:
float ALIGN32_BEG fp_h_[1] ALIGN32_END;
int w_;
SeqPoolType type_;
mutable std::string name_;  // cached so name() can return a stable pointer
reg64_t param_src{abi_param1};
reg64_t param_dst{abi_param2};
reg64_t param_attr{abi_param3};
reg64_t reg_tmp{rax};
reg32_t reg32_int_h{r8d};
reg32_t reg32_fp_h{r9d};
reg64_t reg_h_i{r10};
reg64_t reg_ptr_src_i{r11};
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
@@ -26,6 +26,7 @@ namespace jit {
const char* to_string(KernelType kt) {
switch (kt) {
ONE_CASE(kNone);
ONE_CASE(kVMul);
ONE_CASE(kVAdd);
ONE_CASE(kVAddRelu);
@@ -45,12 +46,26 @@ const char* to_string(KernelType kt) {
ONE_CASE(kCRFDecoding);
ONE_CASE(kLayerNorm);
ONE_CASE(kNCHW16CMulNC);
ONE_CASE(kSeqPool);
default:
PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
return "NOT JITKernel";
}
return nullptr;
}
const char* to_string(SeqPoolType tp) {
switch (tp) {
ONE_CASE(kNonePoolType);
ONE_CASE(kSum);
ONE_CASE(kAvg);
ONE_CASE(kSqrt);
default:
PADDLE_THROW("Not support type: %d, or forget to add it.", tp);
return "NOT PoolType";
}
return nullptr;
}
#undef ONE_CASE
KernelType to_kerneltype(const std::string& act) {
...
@@ -119,6 +119,7 @@ typename KernelTuples::func_type Get(
}
const char* to_string(KernelType kt);
const char* to_string(SeqPoolType kt);
KernelType to_kerneltype(const std::string& act);
@@ -134,6 +135,11 @@ inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
<< "],act_cand[" << to_string(attr.act_cand) << "]";
return os;
}
inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type["
<< to_string(attr.type) << "]";
return os;
}
} // namespace jit
} // namespace operators
...
@@ -41,8 +41,16 @@ typedef enum {
kCRFDecoding,
kLayerNorm,
kNCHW16CMulNC,
kSeqPool,
} KernelType;
typedef enum {
kNonePoolType = 0,
kSum = 1,
kAvg,
kSqrt,
} SeqPoolType;
template <typename T>
struct XYZNTuples {
typedef T data_type;
@@ -112,6 +120,21 @@ struct GRUTuples {
typedef void (*func_type)(gru_t*, const gru_attr_t*);
};
typedef struct seq_pool_attr_s {
int h, w; // h must stay the first member: the JIT code reads it via dword[param_attr]
SeqPoolType type;
seq_pool_attr_s() = default;
explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1)
: h(height), w(width), type(pool_type) {}
} seq_pool_attr_t;
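// Illustrative usage (hypothetical values, not from the patch): the attribute
// is usually built with the width and type first, then h is updated per
// sequence, e.g.
//   jit::seq_pool_attr_t attr(8, jit::SeqPoolType::kAvg);  // w = 8, h = 1
//   attr.h = 5;  // now pools 5 rows of 8 floats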
template <typename T>
struct SeqPoolTuples {
typedef T data_type;
typedef seq_pool_attr_t attr_type;
typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
};
template <typename T>
struct CRFDecodingTuples {
typedef T data_type;
...
@@ -42,6 +42,13 @@ size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
(static_cast<int>(attr.act_cand) << act_type_shift);
}
template <>
size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
size_t key = attr.w;
constexpr int pool_type_shift = 3;
return (key << pool_type_shift) + static_cast<int>(attr.type);
}
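// Illustrative key packing (hypothetical values, not from the patch): with
// pool_type_shift == 3 the low bits hold the SeqPoolType and the high bits
// the width, e.g. attr{w = 8, type = kAvg} (kAvg == 2) keys to
// (8 << 3) + 2 == 66. The height is deliberately left out of the key, so one
// generated kernel is reused across sequence lengths; h is read from the
// attr at run time.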
} // namespace jit
} // namespace operators
} // namespace paddle
@@ -9,3 +9,4 @@ USE_JITKERNEL_MORE(kVScal, mkl)
USE_JITKERNEL_MORE(kVExp, mkl)
USE_JITKERNEL_MORE(kVSigmoid, mkl)
USE_JITKERNEL_MORE(kVTanh, mkl)
USE_JITKERNEL_MORE(kSeqPool, mkl)
@@ -72,6 +72,26 @@ void VExp<double>(const double* x, double* y, int n) {
platform::dynload::vdExp(n, x, y);
}
template <>
void VCopy<float>(const float* x, float* y, int n) {
platform::dynload::cblas_scopy(n, x, 1, y, 1);
}
template <>
void VCopy<double>(const double* x, double* y, int n) {
platform::dynload::cblas_dcopy(n, x, 1, y, 1);
}
template <>
void VAXPY<float>(float a, const float* x, float* y, int n) {
platform::dynload::cblas_saxpy(n, a, x, 1, y, 1);
}
template <>
void VAXPY<double>(double a, const double* x, double* y, int n) {
platform::dynload::cblas_daxpy(n, a, x, 1, y, 1);
}
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template <>
bool VMulKernel<float>::UseMe(const int& d) const {
@@ -103,6 +123,16 @@ bool VTanhKernel<float>::UseMe(const int& d) const {
return d > 7;
}
template <>
bool SeqPoolKernel<float>::UseMe(const seq_pool_attr_t& attr) const {
return true;
}
template <>
bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
return true;
}
#define AWALYS_USE_ME_WITH_DOUBLE(func) \
template <> \
bool func##Kernel<double>::UseMe(const int& d) const { \
@@ -135,5 +165,6 @@ REGISTER_MKL_KERNEL(kVScal, VScal);
REGISTER_MKL_KERNEL(kVExp, VExp);
REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
REGISTER_MKL_KERNEL(kVTanh, VTanh);
REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
#undef REGISTER_MKL_KERNEL
@@ -14,6 +14,7 @@
#pragma once
#include <cmath>
#include <type_traits>
#include "paddle/fluid/operators/jit/kernel_base.h"
@@ -35,6 +36,12 @@ void VScal(const T* a, const T* x, T* y, int n);
template <typename T>
void VExp(const T* x, T* y, int n);
template <typename T>
void VCopy(const T* x, T* y, int n);
template <typename T>
void VAXPY(T a, const T* x, T* y, int n);
template <typename T>
void VSigmoid(const T* x, T* y, int n) {
const T min = SIGMOID_THRESHOLD_MIN;
@@ -60,6 +67,23 @@ void VTanh(const T* x, T* y, int n) {
}
}
template <typename T>
void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
VCopy<T>(x, y, attr->w);
for (int h = 1; h != attr->h; ++h) {
VAXPY<T>(static_cast<T>(1), x + h * attr->w, y, attr->w);
}
if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) {
T scalar = static_cast<T>(1);
if (attr->type == SeqPoolType::kAvg) {
scalar = scalar / static_cast<T>(attr->h);
} else {
scalar = scalar / std::sqrt(static_cast<T>(attr->h));
}
VScal<T>(&scalar, y, y, attr->w);
}
}
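// Hedged usage sketch of the helper above (hypothetical values, not from the
// patch), for x laid out as h rows of w contiguous floats:
//   float x[12] = {/* 3 rows (h) of 4 floats (w) */};
//   float y[4];
//   seq_pool_attr_t attr(4, SeqPoolType::kSum, 3);
//   SeqPool<float>(x, y, &attr);  // y[j] == x[j] + x[4 + j] + x[8 + j]
// For kAvg the sum is then scaled by 1/h, for kSqrt by 1/sqrt(h).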
#define DECLARE_MKL_KERNEL(name, tuples) \
template <typename T> \
class name##Kernel : public KernelMore<tuples<T>> { \
@@ -81,6 +105,8 @@ DECLARE_MKL_KERNEL(VExp, XYNTuples);
DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
DECLARE_MKL_KERNEL(VTanh, XYNTuples);
DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
#undef DECLARE_MKL_KERNEL
} // namespace mkl
...
@@ -26,3 +26,4 @@ USE_JITKERNEL_REFER(kGRUHtPart2)
USE_JITKERNEL_REFER(kCRFDecoding)
USE_JITKERNEL_REFER(kLayerNorm)
USE_JITKERNEL_REFER(kNCHW16CMulNC)
USE_JITKERNEL_REFER(kSeqPool)
@@ -47,4 +47,6 @@ REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm);
REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC);
REGISTER_REFER_KERNEL(kSeqPool, SeqPool);
#undef REGISTER_REFER_KERNEL
@@ -332,6 +332,28 @@ void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) {
}
}
template <typename T>
void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
for (int w = 0; w < attr->w; ++w) {
const T* src = x + w;
T* dst = y + w;
*dst = static_cast<T>(0);
for (int h = 0; h < attr->h; ++h) {
*dst = *dst + *src;
src += attr->w;
}
}
if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) {
T scalar = static_cast<T>(1);
if (attr->type == SeqPoolType::kAvg) {
scalar = scalar / static_cast<T>(attr->h);
} else {
scalar = scalar / std::sqrt(static_cast<T>(attr->h));
}
VScal<T>(&scalar, y, y, attr->w);
}
}
#define DECLARE_REFER_KERNEL(name, tuples) \
template <typename T> \
class name##Kernel : public ReferKernel<tuples<T>> { \
@@ -370,6 +392,8 @@ DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples);
DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples);
DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples);
#undef DECLARE_REFER_KERNEL
} // namespace refer
...
@@ -211,6 +211,24 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
}
};
template <typename T>
struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>,
std::vector<T>> {
void operator()(const typename jit::SeqPoolTuples<T>::func_type tgt,
const std::vector<T>& x, const std::vector<T>& yref,
const typename jit::SeqPoolTuples<T>::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(x.size() % yref.size(), 0);
int w = yref.size();
std::vector<T> y(w);
const T* x_data = x.data();
const T* yref_data = yref.data();
T* y_data = y.data();
tgt(x_data, y_data, &attr);
ExpectEQ<T>(y_data, yref_data, w);
}
};
template <paddle::operators::jit::KernelType KT, typename KernelTuples,
typename PlaceType, typename... Args>
void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
@@ -415,6 +433,31 @@ void TestGRUKernel() {
}
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestSeqPoolKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
std::vector<jit::SeqPoolType> pool_types = {
jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
for (auto type : pool_types) {
for (int w : TestSizes()) {
jit::seq_pool_attr_t attr(w, type);
for (int h : TestSizes()) {
attr.h = h;
auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(h * w), yref(w);
RandomVec<T>(h * w, x.data(), -2.f, 2.f);
const T* x_data = x.data();
T* yref_data = yref.data();
ref(x_data, yref_data, &attr);
VLOG(10) << attr;
TestAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType, std::vector<T>,
std::vector<T>>(attr, x, yref, attr);
}
}
}
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestNCHW16CMulNCKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
@@ -569,6 +612,12 @@ TEST(JITKernel, kGRUHtPart2) {
TestGRUKernel<jit::kGRUHtPart2, double, paddle::platform::CPUPlace>();
}
TEST(JITKernel, kSeqPool) {
namespace jit = paddle::operators::jit;
TestSeqPoolKernel<jit::kSeqPool, float, paddle::platform::CPUPlace>();
TestSeqPoolKernel<jit::kSeqPool, double, paddle::platform::CPUPlace>();
}
TEST(JITKernel, kNCHW16CMulNC) {
namespace jit = paddle::operators::jit;
TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float,
...
@@ -51,7 +51,7 @@ math_library(pooling)
math_library(selected_rows_functor DEPS selected_rows math_function blas)
math_library(sequence2batch)
math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function jit_kernel_helper)
math_library(sequence_scale)
math_library(softmax DEPS math_function)
...
@@ -195,6 +195,10 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input1,
framework::Tensor* input2) {
if (UNLIKELY(input1.rows().size() == 0)) {
LOG(WARNING) << "input selected rows is empty!";
return;
}
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/operators/math/sequence_pooling.h"
...@@ -239,15 +240,33 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> { ...@@ -239,15 +240,33 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
last_pool(context, input, output); last_pool(context, input, output);
return; return;
} }
if (pooltype == "FIRST") { if (pooltype == "FIRST") {
math::FirstSeqPoolFunctor<T> first_pool; math::FirstSeqPoolFunctor<T> first_pool;
first_pool(context, input, output); first_pool(context, input, output);
return; return;
} }
auto lod = input.lod()[0]; auto lod = input.lod()[0];
if (pooltype == "SUM") {
auto place = context.GetPlace();
PADDLE_ENFORCE(platform::is_cpu_place(place));
const T* src = input.data<T>();
T* dst = output->mutable_data<T>(place);
jit::seq_pool_attr_t attr(
static_cast<int>(input.numel() / input.dims()[0]),
jit::SeqPoolType::kSum);
auto seqpool =
jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
attr);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
attr.h = static_cast<int>(lod[i + 1] - lod[i]);
seqpool(src, dst, &attr);
dst += attr.w;
src += attr.h * attr.w;
}
return;
}
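// Illustration (hypothetical LoD, not from the patch): lod holds the
// sequence boundaries, so with lod = {0, 2, 5} the loop above pools rows
// [0, 2) with attr.h = 2, then rows [2, 5) with attr.h = 3, emitting one
// w-wide output row per sequence.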
auto& place = *context.eigen_device();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
Tensor in_t =
input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
@@ -258,15 +277,6 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
auto out_e = EigenVector<T>::Flatten(out_t);
if (pooltype == "AVERAGE") {
out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
} else if (pooltype == "SUM") {
if (h > 0) {
const T* in_data = in_t.data<T>();
T* out_data = out_t.mutable_data<T>(context.GetPlace());
blas.VCOPY(w, in_data, out_data);
for (int64_t r = 1; r != h; ++r) {
blas.AXPY(w, 1., in_data + r * w, out_data);
}
}
} else if (pooltype == "SQRT") { } else if (pooltype == "SQRT") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) / out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h)); std::sqrt(static_cast<T>(h));
......
...@@ -49,6 +49,7 @@ class SoftmaxGradCUDNNFunctor { ...@@ -49,6 +49,7 @@ class SoftmaxGradCUDNNFunctor {
const framework::Tensor* Y, const framework::Tensor* y_grad, const framework::Tensor* Y, const framework::Tensor* y_grad,
framework::Tensor* x_grad); framework::Tensor* x_grad);
}; };
#endif #endif
} // namespace math } // namespace math
......
...@@ -23,5 +23,7 @@ limitations under the License. */ ...@@ -23,5 +23,7 @@ limitations under the License. */
#include "ops/binary_unnary_op.h" #include "ops/binary_unnary_op.h"
#include "ops/fill_constant_op.h" #include "ops/fill_constant_op.h"
#include "ops/mean_op.h"
#include "ops/mul_op.h" #include "ops/mul_op.h"
#include "ops/scale_op.h"
#include "ops/top_k_op.h" #include "ops/top_k_op.h"
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
@@ -48,4 +47,3 @@ static void BuildUnaryNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
float scale, std::shared_ptr<ngraph::Node> node) {
auto node_shape = node->get_shape();
auto scale_const = ngraph::op::Constant::create(node->get_element_type(),
node_shape, {scale});
return std::make_shared<T>(scale_const, node);
}
template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
std::shared_ptr<ngraph::Node> scale_1d,
std::shared_ptr<ngraph::Node> node) {
auto scale_shape = scale_1d->get_shape();
PADDLE_ENFORCE_EQ(scale_shape.size(), 1, "Only a 1-D scale node is supported");
PADDLE_ENFORCE_EQ(scale_shape.at(0), 1, "scale 1d should be in shape {1}");
auto node_shape = node->get_shape();
ngraph::AxisSet axis_set;
for (size_t i = 0; i < node_shape.size(); ++i) {
axis_set.insert(i);
}
node_shape.push_back(1);
auto scale_bcast =
std::make_shared<ngraph::op::Broadcast>(scale_1d, node_shape, axis_set);
auto scale_reshape =
paddle::platform::NgReshaper(scale_bcast, node->get_shape());
return std::make_shared<T>(scale_reshape, node);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
@@ -58,4 +57,3 @@ void BuildFillConstantNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildMeanNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
ngraph::AxisSet axes;
for (size_t i = 0; i < input->get_shape().size(); ++i) {
axes.insert(i);
}
auto mean = ngraph::builder::mean(input, axes);
auto mean_1d = std::make_shared<ngraph::op::Reshape>(
mean, ngraph::AxisVector{}, ngraph::Shape{1});
paddle::platform::SetOutputNode(op, "Out", mean_1d, ngb_node_map);
}
void BuildMeanGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
auto x_shape = x->get_shape();
float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1.0f,
std::multiplies<float>());
auto node_const = ngraph::op::Constant::create(og->get_element_type(),
ngraph::Shape{1}, {x_size});
auto node_div = std::make_shared<ngraph::op::Divide>(og, node_const);
auto result = ElementwiseScalar<ngraph::op::Add>(
node_div,  // reuse node_div instead of recomputing og / node_const
ngraph::op::Constant::create(og->get_element_type(), x_shape, {0}));
paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
@@ -131,4 +130,3 @@ static void BuildMulGradNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildScaleNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
float scale = op_attrs.Get<float>("scale");
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto out = ElementwiseScalar<ngraph::op::Multiply>(scale, x);
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
@@ -48,4 +47,3 @@ void BuildTopKNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
@@ -13,10 +13,10 @@
// limitations under the License.
#include "paddle/fluid/operators/py_func_op.h"
#include <set>
#include <string>
#include <vector>
#include "Python.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
......
...@@ -13,8 +13,7 @@ ...@@ -13,8 +13,7 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "paddle/fluid/framework/python_headers.h"
#include "pybind11/pybind11.h"
namespace paddle {
namespace operators {
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -58,12 +55,24 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
} // namespace
static __device__ __forceinline__ platform::float16 exp_on_device(
platform::float16 x) {
return ::Eigen::numext::exp(x);
}
static __device__ __forceinline__ float exp_on_device(float x) {
return expf(x);
}
static __device__ __forceinline__ double exp_on_device(double x) {
return exp(x);
}
static __device__ __forceinline__ platform::float16 log_on_device(
platform::float16 x) {
return math::TolerableValue<platform::float16>()(::Eigen::numext::log(x));
}
static __device__ __forceinline__ float log_on_device(float x) {
return math::TolerableValue<float>()(logf(x));
}
static __device__ __forceinline__ double log_on_device(double x) {
return math::TolerableValue<double>()(log(x));
}
@@ -72,25 +81,20 @@ static __device__ __forceinline__ double log_on_device(double x) {
/*
Supposing x is `logits` and y is `labels`, the equations are as follows:
cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})]
= \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})]
= \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})]
= \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)]
= \sum_{j}(-y_i_j * tmp_i_j)
softmax_i_j = e^{tmp_i_j}
where:
max_i = \max_{j}{x_i_j}
logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i}
tmp_i_j = x_i_j - max_i - logDiffMaxSum_i
Therefore, the calculation can be separated into 3 steps:
Step 1: row-wise operation to calculate max_i
Step 2: row-wise operation to calculate logDiffMaxSum_i
Step 3: calculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i
To save memory, we can share memory among max_i, logDiffMaxSum_i and
cross\_entropy_i.
In this way, the 3 steps should be changed to:
@@ -134,7 +138,8 @@ static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max());
if (threadIdx.x == 0) {
max_data[blockIdx.x] =
cur_max < static_cast<T>(-64) ? static_cast<T>(-64) : cur_max;
}
}
@@ -151,17 +156,17 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
auto block_max = max_data[blockIdx.x];
softmax[beg_idx] = logits_data[beg_idx] - block_max;
T diff_max_sum = exp_on_device(softmax[beg_idx]);
auto idx = beg_idx + BlockDim;
while (idx < end_idx) {
softmax[idx] = logits_data[idx] - block_max;
diff_max_sum += exp_on_device(softmax[idx]);
idx += BlockDim;
}
diff_max_sum =
BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
if (threadIdx.x == 0) max_data[blockIdx.x] = log_on_device(diff_max_sum);
if (!CalculateLogSoftmax) return;
__syncthreads();
@@ -188,12 +193,12 @@ static __global__ void RowReductionForSoftmaxAndCrossEntropy(
// log_diff_max_sum shares memory with loss
auto block_log_diff_max_sum = loss_data[blockIdx.x];
auto tmp = softmax[beg_idx] - block_log_diff_max_sum;
softmax[beg_idx] = exp_on_device(tmp);
auto loss = -labels_data[beg_idx] * tmp;
beg_idx += BlockDim;
while (beg_idx < end_idx) {
tmp = softmax[beg_idx] - block_log_diff_max_sum;
softmax[beg_idx] = exp_on_device(tmp);
loss -= (labels_data[beg_idx] * tmp);
beg_idx += BlockDim;
}
@@ -218,10 +223,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor {
auto row_idx = idx / feature_size_;
auto col_idx = idx % feature_size_;
if (col_idx != labels_[row_idx]) {
log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
} else {
auto softmax = log_softmax_[idx];
log_softmax_[idx] = exp_on_device(softmax);
loss_[row_idx] = -softmax;
}
}
@@ -253,10 +258,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx {
auto row_idx = idx / feature_size_;
auto col_idx = idx % feature_size_;
if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) {
log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
} else {
auto softmax = log_softmax_[idx];
log_softmax_[idx] = exp_on_device(softmax);
loss_[row_idx] = -softmax;
}
}
@@ -464,9 +469,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyCUDAKernel<paddle::platform::float16>,
ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(
softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<paddle::platform::float16>,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
@@ -41,7 +41,9 @@ class SumOp : public framework::OperatorWithKernel {
return; // skip runtime infershape when input is a tensor array
}
auto x_var_types = ctx->GetInputsVarType("X");
auto x_dims = ctx->GetInputsDim("X"); auto x_dims = ctx->GetInputsDim("X");
size_t N = x_dims.size(); size_t N = x_dims.size();
PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0."); PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
if (N == 1) { if (N == 1) {
...@@ -49,7 +51,13 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -49,7 +51,13 @@ class SumOp : public framework::OperatorWithKernel {
} }
framework::DDim in_dim({0}); framework::DDim in_dim({0});
for (auto& x_dim : x_dims) { for (size_t i = 0; i < x_dims.size(); ++i) {
auto& x_dim = x_dims[i];
// x_dim.size() == 1 means the real dim of selected rows is [0]
if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS &&
x_dim.size() == 1) {
continue;
}
if (framework::product(x_dim) == 0) {
continue;
}
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
"Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
"The 1st dimension of Input(X) and Input(Label) should "
"be equal.");
PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
"The 2nd dimension of "
"Input(Label) should be 1.");
ctx->SetOutputDim("Y", {x_dims[0], 1});
ctx->ShareLoD("X", /*->*/ "Y");
}
protected:
// Explicitly set that the data type of computation kernel of
// teacher_student_sigmoid_loss
// is determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context());
}
};
class TeacherStudentSigmoidLossGradientOp
: public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@GRAD) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
"The 1st dimension of Input(X) and Input(Label) should "
"be equal.");
PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
"The 1st dimension of Input(X) and Input(Y@Grad) should "
"be equal.");
PADDLE_ENFORCE_EQ(dy_dims[1], 1,
"The 2nd dimension of Input(Y@Grad) should be 1.");
PADDLE_ENFORCE_EQ(label_dims[1], 1,
"When Attr(soft_label) == false, the 2nd dimension of "
"Input(Label) should be 1.");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X"));
}
protected:
// Explicitly set that the data type of computation kernel of
// teacher_student_sigmoid_loss
// is determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context());
}
};
class TeacherStudentSigmoidLossOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, default Tensor<float>), a 2-D tensor with shape [N x 1],"
" where N is the batch size and D is the output. "
"This input is a probability computed by the previous operator, "
"which is almost always the result of a softmax operator.");
AddInput("Label",
"(Tensor), the ground truth which is a 2-D tensor. "
"Label is a Tensor<float> with shape [N x 1]. ");
AddOutput("Y",
"(Tensor, default Tensor<float>), a 2-D tensor with shape "
"[N x 1]. The teacher student sigmoid loss.");
AddAttr<float>(
"soft_max_up_bound",
"fp32, if input > soft_max_up_bound, will be bound, default 15.0")
.SetDefault(15.0);
AddAttr<float>(
"soft_max_lower_bound",
"fp32, if input < soft_max_lower_bound, will be bound, default -15.0")
.SetDefault(-15.0);
AddComment(R"DOC(
TeacherStudentSigmoidLoss Operator.
It's similar to the SigmoidCrossEntropyWithLogits Operator. The difference is that
we add another label (z') to the original.
loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x)))
z is click or not
z' is the teacher value
label = {-2, -1, [0, 2]}
when z' does not exist, clk = 0 : label = -2;
when z' does not exist, clk = 1 : label = -1;
when z' exists, clk = 0 : label = 0 + z';
when z' exists, clk = 1 : label = 1 + z';
)DOC");
}
};
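// Worked reading of the label encoding above (illustrative teacher value
// z' = 0.7, not from the patch):
//   no teacher, no click -> label = -2
//   no teacher, click    -> label = -1
//   teacher,    no click -> label = 0 + 0.7 = 0.7
//   teacher,    click    -> label = 1 + 0.7 = 1.7
// which is why the kernels branch on label < -1.0, label < 0.0, label < 1.0.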
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(teacher_student_sigmoid_loss,
ops::TeacherStudentSigmoidLossOp,
ops::TeacherStudentSigmoidLossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad,
ops::TeacherStudentSigmoidLossGradientOp);
REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss,
ops::TeacherStudentSigmoidLossOpKernel<float>,
ops::TeacherStudentSigmoidLossOpKernel<double>);
REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss_grad,
ops::TeacherStudentSigmoidLossGradOpKernel<float>,
ops::TeacherStudentSigmoidLossGradOpKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
Tensor* y = context.Output<Tensor>("Y");
const Tensor* x = context.Input<Tensor>("X");
const Tensor* labels = context.Input<Tensor>("Label");
T* y_data = y->mutable_data<T>(context.GetPlace());
const T* x_data = x->data<T>();
const T* label_data = labels->data<T>();
int64_t batch_size = x->dims()[0];
// loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' +
// log(1 + exp(-abs(x)))
// z is click or not
// z' is value q of feed_fine
// label = {-2, -1, [0, 2]}
// when z' does not exist, clk = 0 : label = -2;
// when z' does not exist, clk = 1 : label = -1;
// when z' exists, clk = 0 : label = 0 + z';
// when z' exists, clk = 1 : label = 1 + z';
for (int i = 0; i < batch_size; ++i) {
if (label_data[i] < -1.0) {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) +
log(1.0 + exp(-fabs(x_data[i])));
} else if (label_data[i] < 0.0) {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] +
log(1.0 + exp(-fabs(x_data[i])));
} else if (label_data[i] < 1.0) {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) +
log(1.0 + exp(-fabs(x_data[i]))) +
(x_data[i] > 0 ? x_data[i] : 0.0) -
x_data[i] * label_data[i] +
log(1.0 + exp(-fabs(x_data[i])));
} else {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] +
log(1.0 + exp(-fabs(x_data[i]))) +
(x_data[i] > 0 ? x_data[i] : 0.0) -
x_data[i] * (label_data[i] - 1.0) +
log(1.0 + exp(-fabs(x_data[i])));
}
}
}
};
template <typename T>
class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* x = context.Input<Tensor>("X");
const T* x_data = x->data<T>();
Tensor* dx = context.Output<Tensor>(framework::GradVarName("X"));
T* dx_data = dx->mutable_data<T>(context.GetPlace());
const Tensor* labels = context.Input<Tensor>("Label");
const T* label_data = labels->data<T>();
T soft_max_up_bound =
static_cast<T>(context.Attr<float>("soft_max_up_bound"));
T soft_max_lower_bound =
static_cast<T>(context.Attr<float>("soft_max_lower_bound"));
int64_t batch_size = x->dims()[0];
const framework::Tensor* dOut =
context.Input<framework::Tensor>(framework::GradVarName("Y"));
const T* dout_data = dOut->data<T>();
for (int i = 0; i < batch_size; ++i) {
T sum_val = x_data[i];
if (sum_val > soft_max_up_bound) {
sum_val = soft_max_up_bound;
} else {
if (sum_val < soft_max_lower_bound) {
sum_val = soft_max_lower_bound;
}
}
T pred = 1.0 / (1.0 + exp(-sum_val));
if (label_data[i] < -1.0) {
dx_data[i] = 0.0 - pred;
} else if (label_data[i] < 0.0) {
dx_data[i] = 1.0 - pred;
} else {
dx_data[i] = label_data[i] - 2.0 * pred;
}
if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) {
dx_data[i] = 0;
}
dx_data[i] *= dout_data[i] * -1;
}
}
};
} // namespace operators
} // namespace paddle
@@ -35,20 +35,8 @@ limitations under the License. */
DEFINE_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
#if !defined(_WIN32)
DEFINE_uint64(initial_cpu_memory_in_mb,
#ifdef PADDLE_WITH_MKLDNN
/* Aligned with mozga-intel, MKLDNN need at least 5000 MB
* to obtain the best performance*/
5000ul,
#else
500ul,
#endif
"Initial CPU memory for PaddlePaddle, in MD unit.");
#else
DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
"Initial CPU memory for PaddlePaddle, in MB unit.");
#endif // !defined(_WIN32)
DEFINE_double(
fraction_of_cuda_pinned_memory_to_use, 0.5,
...
@@ -15,6 +15,9 @@
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#ifdef _WIN32
#include <numeric>
#endif
#include <random>
#define PADDLE_CUDA_FP16
...
@@ -92,26 +92,24 @@ platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
const platform::Place& place, const cudaStream_t& stream) {
PADDLE_ENFORCE(platform::is_gpu_place(place));
auto place_stream = std::make_pair(place, stream);
std::unique_lock<std::mutex> lock(mtx_);
auto it = device_allocator_.find(place_stream);
if (it == device_allocator_.end()) {
auto tmp_allocator = new TemporaryAllocator(place);
tmp_allocator->SetCallback([stream]() {
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
PADDLE_ENFORCE(cudaGetLastError());
});
device_allocator_[place_stream].reset(tmp_allocator);
return *tmp_allocator;
} else {
return *it->second;
}
}
template <> template <>
platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
const platform::CUDADeviceContext& dev_ctx) { const platform::CUDADeviceContext& dev_ctx) {
auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream());
if (device_allocator_.count(place_stream)) {
return *device_allocator_.at(place_stream);
}
return Get(dev_ctx.GetPlace(), dev_ctx.stream()); return Get(dev_ctx.GetPlace(), dev_ctx.stream());
} }
#endif #endif
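The refactored Get above performs one lookup and, on a miss, one insert under a single lock, instead of the old count-then-at double lookup. A minimal Python sketch of the same find-or-create pattern (class and member names are stand-ins, not Paddle API):

    import threading

    class _StubAllocator:
        """Stand-in for platform::TemporaryAllocator (illustrative only)."""
        def __init__(self, place):
            self.place = place

    class DeviceTemporaryAllocatorSketch:
        def __init__(self):
            self._mtx = threading.Lock()
            self._allocators = {}  # (place, stream) -> allocator

        def get(self, place, stream):
            key = (place, stream)
            with self._mtx:  # one lock, one lookup, create on miss
                alloc = self._allocators.get(key)
                if alloc is None:
                    alloc = self._allocators[key] = _StubAllocator(place)
                return alloc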
...@@ -292,7 +290,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) ...@@ -292,7 +290,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
if (dynload::HasCUDNN()) { if (dynload::HasCUDNN()) {
auto local_cudnn_version = cudnn_dso_ver / 100; auto local_cudnn_version = cudnn_dso_ver / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100; auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cuda_version < compile_cuda_version) { if (local_cudnn_version < compile_cudnn_version) {
LOG_FIRST_N(WARNING, 1) LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device << "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDNN " << ". The installed Paddle is compiled with CUDNN "
...@@ -325,7 +323,7 @@ Place CUDADeviceContext::GetPlace() const { return place_; } ...@@ -325,7 +323,7 @@ Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const { void CUDADeviceContext::Wait() const {
auto& allocator = auto& allocator =
DeviceTemporaryAllocator::Instance().Get<CUDADeviceContext>(*this); DeviceTemporaryAllocator::Instance().Get<CUDADeviceContext>(*this);
allocator.Release([=]() { allocator.Release([this]() {
PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
PADDLE_ENFORCE(cudaGetLastError()); PADDLE_ENFORCE(cudaGetLastError());
}); });
......
...@@ -61,7 +61,7 @@ namespace platform { ...@@ -61,7 +61,7 @@ namespace platform {
* the allocations of temp_allocation_queue: * the allocations of temp_allocation_queue:
* - when the Stream calls cudaStreamSynchronize; * - when the Stream calls cudaStreamSynchronize;
* - when the total size of the pending temporary allocations exceeds a certain threshold
* (defined by FLAGS_limit_of_temporary_allocation). * (defined by FLAGS_limit_of_tmp_allocation).
* *
* */ * */
class DeviceTemporaryAllocator { class DeviceTemporaryAllocator {
......
...@@ -59,7 +59,7 @@ limitations under the License. */ ...@@ -59,7 +59,7 @@ limitations under the License. */
#if !defined(_WIN32) #if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x))) #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else #else
#define PADDLE_ALIGN(x) /*do nothing*/ #define PADDLE_ALIGN(x) __declspec(align(x))
#endif #endif
namespace paddle { namespace paddle {
......
...@@ -271,11 +271,13 @@ TEST(float16, isinf) { ...@@ -271,11 +271,13 @@ TEST(float16, isinf) {
float16 b = float16(INFINITY); float16 b = float16(INFINITY);
// underflow to 0 // underflow to 0
float16 native_a(5e-40f); float16 native_a(5e-40f);
// overflow to inf
float16 native_b(5e40f);
EXPECT_EQ(std::isinf(a), true); EXPECT_EQ(std::isinf(a), true);
EXPECT_EQ(std::isinf(b), true); EXPECT_EQ(std::isinf(b), true);
#ifndef _WIN32
// overflow to inf
float16 native_b(5e40f);
EXPECT_EQ(std::isinf(native_b), true); EXPECT_EQ(std::isinf(native_b), true);
#endif
EXPECT_EQ(native_a, float16(0)); EXPECT_EQ(native_a, float16(0));
} }
......
...@@ -210,13 +210,15 @@ class MKLDNNHandler { ...@@ -210,13 +210,15 @@ class MKLDNNHandler {
dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast<T>(output_data))); dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast<T>(output_data)));
} }
static void AppendKey( static void AppendKey(std::string* key,
std::string* key, const mkldnn::memory::dims& input_dims, const mkldnn::memory::dims& input_dims,
const mkldnn::memory::dims& weights_dims, const std::vector<int>& strides, const mkldnn::memory::dims& weights_dims,
const std::vector<int>& paddings, const std::vector<int>& dilations, const std::vector<int>& strides,
const int& groups, const mkldnn::memory::data_type& srcdt, const std::vector<int>& paddings,
const mkldnn::memory::format& format, const std::vector<int>& dilations, const int& groups,
const mkldnn::memory::data_type& dstdt, const std::string& suffix) { const mkldnn::memory::data_type& srcdt,
const mkldnn::memory::format& format, const bool& relu,
const bool& residual, const std::string& suffix) {
AppendKeyDims(key, input_dims); AppendKeyDims(key, input_dims);
AppendKeyDims(key, weights_dims); AppendKeyDims(key, weights_dims);
AppendKeyVec(key, strides); AppendKeyVec(key, strides);
...@@ -225,7 +227,8 @@ class MKLDNNHandler { ...@@ -225,7 +227,8 @@ class MKLDNNHandler {
AppendKey(key, std::to_string(groups)); AppendKey(key, std::to_string(groups));
AppendKey(key, std::to_string(srcdt)); AppendKey(key, std::to_string(srcdt));
AppendKey(key, std::to_string(format)); AppendKey(key, std::to_string(format));
AppendKey(key, std::to_string(dstdt)); AppendKey(key, std::to_string(relu));
AppendKey(key, std::to_string(residual));
AppendKey(key, suffix); AppendKey(key, suffix);
} }
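The practical effect of the new signature is that the relu and residual fusion flags become part of the primitive cache key, so fused and unfused convolutions no longer collide in the cache. A self-contained sketch of such a key; all names and values are illustrative:

    def conv_key(input_dims, weights_dims, strides, paddings, dilations,
                 groups, srcdt, fmt, relu, residual, suffix):
        parts = (list(input_dims) + list(weights_dims) + list(strides) +
                 list(paddings) + list(dilations) +
                 [groups, srcdt, fmt, int(relu), int(residual), suffix])
        return "-".join(str(p) for p in parts)

    key = conv_key([1, 3, 224, 224], [64, 3, 3, 3], [1, 1], [1, 1], [1, 1],
                   1, "s8", "nchw", True, False, "conv2d")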
...@@ -664,15 +667,35 @@ static std::shared_ptr<mkldnn::memory> SetDstMemory( ...@@ -664,15 +667,35 @@ static std::shared_ptr<mkldnn::memory> SetDstMemory(
} }
template <typename T> template <typename T>
static std::shared_ptr<mkldnn::memory> SetDstMemoryHandler( static std::shared_ptr<mkldnn::memory> SetDstMemory(
const framework::ExecutionContext& ctx, framework::Tensor* output, const framework::ExecutionContext& ctx, framework::Tensor* output,
const std::shared_ptr<ConvMKLDNNHandler>& handler) { const framework::Tensor* residual_param,
const mkldnn::memory::desc& user_residual_md,
const std::shared_ptr<ConvMKLDNNHandler>& handler,
std::vector<mkldnn::primitive>* pipeline) {
const T* residual_param_data = residual_param->data<T>();
PADDLE_ENFORCE(residual_param_data != nullptr,
"Provide data if you want MKLDNN conv+elementwise_add fusion");
std::shared_ptr<mkldnn::memory> user_residual_memory_p =
handler->AcquireResidualDataMemory(user_residual_md,
to_void_cast<T>(residual_param_data));
T* output_data = output->mutable_data<T>(ctx.GetPlace());
std::shared_ptr<mkldnn::memory> dst_memory_p =
handler->AcquireDstMemoryFromResidualDataMemory(
user_residual_memory_p, to_void_cast<T>(output_data), *pipeline);
return dst_memory_p;
}
template <typename T>
static void SetDstMemoryHandler(
const framework::ExecutionContext& ctx, framework::Tensor* output,
const std::shared_ptr<ConvMKLDNNHandler>& handler,
std::shared_ptr<mkldnn::memory>* dst_memory_p) {
T* output_data = output->mutable_data<T>( T* output_data = output->mutable_data<T>(
ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, ctx.GetPlace(), ::paddle::memory::Allocator::kDefault,
handler->GetDstMemorySize()); handler->GetDstMemorySize());
std::shared_ptr<mkldnn::memory> dst_memory_p; (*dst_memory_p)->set_data_handle(to_void_cast<T>(output_data));
dst_memory_p->set_data_handle(to_void_cast<T>(output_data));
return dst_memory_p;
} }
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -15,8 +15,15 @@ ...@@ -15,8 +15,15 @@
#include "paddle/fluid/platform/temporary_allocator.h" #include "paddle/fluid/platform/temporary_allocator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
DEFINE_double(limit_of_temporary_allocation, -1, DEFINE_int64(limit_of_tmp_allocation, -1,
"The up limit of temporary_allocation size."); "The up limit of temporary_allocation size.");
DEFINE_double(times_excess_than_required_tmp_allocation, 2,
"times_excess_than_required_tmp_allocation indicates the "
"max size the TemporaryAllocator can return. For example, "
"if the required memory size is N, and "
"times_excess_than_required_tmp_allocation is 2.0, "
"the TemporaryAllocator will return the available allocation "
"that the range of size is N ~ 2*N.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -29,24 +36,25 @@ TemporaryAllocation::TemporaryAllocation( ...@@ -29,24 +36,25 @@ TemporaryAllocation::TemporaryAllocation(
underlying_allocation_(std::move(underlying_allocation)) {} underlying_allocation_(std::move(underlying_allocation)) {}
TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) {
temp_mem_queue_.reset(new std::deque<TemporaryAllocation *>()); temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
} }
bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } bool TemporaryAllocator::IsAllocThreadSafe() const { return true; }
void TemporaryAllocator::Release(const std::function<void()> &callback) { void TemporaryAllocator::Release(const std::function<void()> &callback) {
std::shared_ptr<std::deque<TemporaryAllocation *>> t_allocations; std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> t_allocations;
{ {
std::unique_lock<std::mutex> lock(mtx_); std::unique_lock<std::mutex> lock(mtx_);
callback(); callback();
t_allocations = temp_mem_queue_; t_allocations.swap(temp_mem_map_);
temp_mem_queue_.reset(new std::deque<TemporaryAllocation *>()); temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
wait_delete_mem_ = 0; wait_delete_mem_ = 0;
} }
for (auto tmp : *t_allocations) { for (auto tmp : *t_allocations) {
VLOG(10) << "Delete temporary allocation " << tmp->ptr() VLOG(10) << "Delete temporary allocation " << tmp.second->ptr()
<< " size: " << tmp->size(); << " size: " << tmp.second->size();
delete tmp; delete tmp.second;
} }
} }
...@@ -54,28 +62,34 @@ void TemporaryAllocator::Free(alloc::Allocation *allocation) { ...@@ -54,28 +62,34 @@ void TemporaryAllocator::Free(alloc::Allocation *allocation) {
auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation); auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation);
PADDLE_ENFORCE_NOT_NULL(temp_allocation); PADDLE_ENFORCE_NOT_NULL(temp_allocation);
if (platform::is_gpu_place(temp_allocation->place())) { if (platform::is_gpu_place(temp_allocation->place())) {
PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_),
"The place should be the same.");
size_t wait_delete_mem = 0; size_t wait_delete_mem = 0;
{ {
std::unique_lock<std::mutex> lock(mtx_); std::unique_lock<std::mutex> lock(mtx_);
temp_mem_queue_->emplace_back(temp_allocation); temp_mem_map_->emplace(temp_allocation->size(), temp_allocation);
wait_delete_mem_ += temp_allocation->size(); wait_delete_mem_ += temp_allocation->size();
wait_delete_mem = wait_delete_mem_; wait_delete_mem = wait_delete_mem_;
VLOG(10) << "Move temporary allocation: " << temp_allocation->ptr() VLOG(10) << "Move temporary allocation: " << temp_allocation->ptr()
<< " to delete queue: " << temp_allocation->size() << "; " << " to delete queue: " << temp_allocation->size() << "; "
<< "wait_delete_mem: " << wait_delete_mem_; << "wait_delete_mem: " << wait_delete_mem;
} }
if (FLAGS_limit_of_temporary_allocation > 0 &&
wait_delete_mem > FLAGS_limit_of_temporary_allocation) { if (FLAGS_limit_of_tmp_allocation > 0 &&
wait_delete_mem > static_cast<size_t>(FLAGS_limit_of_tmp_allocation)) {
PADDLE_ENFORCE(callback_ != nullptr, "The callback is not initialized.");
Release(callback_); Release(callback_);
} }
return; return;
} }
VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr()
<< " size: " << temp_allocation->size();
delete temp_allocation; delete temp_allocation;
} }
size_t TemporaryAllocator::TemporaryAllocationQueueSize() { size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
std::unique_lock<std::mutex> lock(mtx_); std::unique_lock<std::mutex> lock(mtx_);
return temp_mem_queue_ ? temp_mem_queue_->size() : 0; return temp_mem_map_ ? temp_mem_map_->size() : 0;
} }
void TemporaryAllocator::SetCallback(const std::function<void()> &callback) { void TemporaryAllocator::SetCallback(const std::function<void()> &callback) {
...@@ -84,6 +98,27 @@ void TemporaryAllocator::SetCallback(const std::function<void()> &callback) { ...@@ -84,6 +98,27 @@ void TemporaryAllocator::SetCallback(const std::function<void()> &callback) {
alloc::Allocation *TemporaryAllocator::AllocateImpl( alloc::Allocation *TemporaryAllocator::AllocateImpl(
size_t size, alloc::Allocator::Attr attr) { size_t size, alloc::Allocator::Attr attr) {
{
// Find available allocation in temp_mem_map.
std::unique_lock<std::mutex> lock(mtx_);
if (temp_mem_map_->size()) {
auto it = temp_mem_map_->lower_bound(size);
// FIXME(zcd): Not sure about the best value for the excess fraction.
if (it != temp_mem_map_->end() &&
it->first <
static_cast<size_t>(
size * FLAGS_times_excess_than_required_tmp_allocation)) {
auto tmp_ptr = it->second;
temp_mem_map_->erase(it);
wait_delete_mem_ -= tmp_ptr->size();
VLOG(10) << "Reuse temporary allocation: " << tmp_ptr->ptr() << ": "
<< tmp_ptr->size();
return tmp_ptr;
}
}
}
  // If no available allocation is found, get a new allocation from the
  // AllocatorFacade instance.
auto raw_allocation = auto raw_allocation =
alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
auto temp_mem = new TemporaryAllocation(std::move(raw_allocation)); auto temp_mem = new TemporaryAllocation(std::move(raw_allocation));
......
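A minimal Python sketch of the reuse policy introduced above: freed blocks are parked keyed by size, AllocateImpl does a lower_bound on the requested size, and a parked block is accepted only while its size stays below the request times FLAGS_times_excess_than_required_tmp_allocation (allocate_fresh stands in for the AllocatorFacade call):

    import bisect

    def allocate_fresh(size):
        """Stand-in for AllocatorFacade::Instance().Alloc (illustrative)."""
        return bytearray(size)

    class TmpPoolSketch:
        def __init__(self, excess=2.0):
            self._sizes, self._blocks, self._excess = [], [], excess

        def free(self, block):
            # park the block keyed by its size (multimap emplace)
            i = bisect.bisect_left(self._sizes, len(block))
            self._sizes.insert(i, len(block))
            self._blocks.insert(i, block)

        def allocate(self, size):
            i = bisect.bisect_left(self._sizes, size)  # lower_bound(size)
            if i < len(self._sizes) and self._sizes[i] < size * self._excess:
                self._sizes.pop(i)
                return self._blocks.pop(i)             # reuse a parked block
            return allocate_fresh(size)                # miss: fresh allocation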
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <condition_variable> // NOLINT #include <condition_variable> // NOLINT
#include <deque> #include <deque>
#include <map>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/lock_guard_ptr.h" #include "paddle/fluid/platform/lock_guard_ptr.h"
...@@ -39,7 +40,7 @@ class TemporaryAllocation : public memory::allocation::Allocation { ...@@ -39,7 +40,7 @@ class TemporaryAllocation : public memory::allocation::Allocation {
* *
* There is one opportunity to free the allocations of temp_allocation_queue: * There is one opportunity to free the allocations of temp_allocation_queue:
* - when the total size of the pending temporary allocations exceeds a certain threshold
* (defined by FLAGS_limit_of_temporary_allocation). * (defined by FLAGS_limit_of_tmp_allocation).
* *
* */ * */
class TemporaryAllocator : public memory::allocation::Allocator { class TemporaryAllocator : public memory::allocation::Allocator {
...@@ -62,11 +63,10 @@ class TemporaryAllocator : public memory::allocation::Allocator { ...@@ -62,11 +63,10 @@ class TemporaryAllocator : public memory::allocation::Allocator {
private: private:
platform::Place place_; platform::Place place_;
// When the allocation is not held by any variable, it should be placed // When the allocation is not held by any variable, it should be placed
// to temp_mem_queue immediately. // to temp_mem_map immediately.
std::shared_ptr<std::deque<TemporaryAllocation *>> temp_mem_queue_{nullptr}; std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> temp_mem_map_{
nullptr};
std::mutex mtx_; std::mutex mtx_;
size_t wait_delete_mem_{0}; size_t wait_delete_mem_{0};
std::function<void()> callback_; std::function<void()> callback_;
......
...@@ -18,7 +18,8 @@ ...@@ -18,7 +18,8 @@
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
DECLARE_double(limit_of_temporary_allocation); DECLARE_int64(limit_of_tmp_allocation);
DECLARE_double(times_excess_than_required_tmp_allocation);
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -35,7 +36,7 @@ class DummyOp : public framework::OperatorBase { ...@@ -35,7 +36,7 @@ class DummyOp : public framework::OperatorBase {
const platform::Place& place) const override {} const platform::Place& place) const override {}
}; };
TEST(temporary_allocator, temporary_allocator) { TEST(temporary_allocator, test_base_function) {
platform::CPUPlace cpu_place; platform::CPUPlace cpu_place;
TemporaryAllocator alloc(cpu_place); TemporaryAllocator alloc(cpu_place);
alloc.Allocate(100); alloc.Allocate(100);
...@@ -59,10 +60,10 @@ TEST(temporary_allocator, temporary_allocator) { ...@@ -59,10 +60,10 @@ TEST(temporary_allocator, temporary_allocator) {
#endif #endif
} }
TEST(temporary_allocator, add_callback) { TEST(temporary_allocator, test_flags_function) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
const double limit = FLAGS_limit_of_temporary_allocation; const int64_t limit = FLAGS_limit_of_tmp_allocation;
FLAGS_limit_of_temporary_allocation = 10; FLAGS_limit_of_tmp_allocation = 10;
platform::CUDAPlace gpu_place(0); platform::CUDAPlace gpu_place(0);
TemporaryAllocator gpu_alloc(gpu_place); TemporaryAllocator gpu_alloc(gpu_place);
...@@ -78,7 +79,52 @@ TEST(temporary_allocator, add_callback) { ...@@ -78,7 +79,52 @@ TEST(temporary_allocator, add_callback) {
}); });
{ gpu_alloc.Allocate(100); } { gpu_alloc.Allocate(100); }
PADDLE_ENFORCE(deleted); PADDLE_ENFORCE(deleted);
FLAGS_limit_of_temporary_allocation = limit; FLAGS_limit_of_tmp_allocation = limit;
#endif
}
TEST(temporary_allocator, test_reuse_tmp_allocation) {
#ifdef PADDLE_WITH_CUDA
platform::CUDAPlace gpu_place(0);
TemporaryAllocator gpu_alloc(gpu_place);
gpu_alloc.SetCallback([]() {});
void* tmp_allocation_ptr1 = nullptr;
{
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
auto tmp_allocation1 = gpu_alloc.Allocate(100);
tmp_allocation_ptr1 = tmp_allocation1->ptr();
}
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
auto tmp_allocation2 = gpu_alloc.Allocate(100);
void* tmp_allocation_ptr2 = tmp_allocation2->ptr();
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2);
auto tmp_allocation3 = gpu_alloc.Allocate(100);
void* tmp_allocation_ptr3 = tmp_allocation2->ptr();
PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr3);
#endif
}
TEST(temporary_allocator, test_times_excess_than_required_tmp_allocation) {
#ifdef PADDLE_WITH_CUDA
platform::CUDAPlace gpu_place(0);
TemporaryAllocator gpu_alloc(gpu_place);
gpu_alloc.SetCallback([]() {});
double excess_fraction = FLAGS_times_excess_than_required_tmp_allocation;
void* tmp_allocation_ptr1 = nullptr;
{
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
auto tmp_allocation1 =
gpu_alloc.Allocate(static_cast<size_t>(100 * excess_fraction - 1));
tmp_allocation_ptr1 = tmp_allocation1->ptr();
}
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
auto tmp_allocation2 = gpu_alloc.Allocate(100);
void* tmp_allocation_ptr2 = tmp_allocation2->ptr();
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2);
#endif #endif
} }
......
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool) feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
tracer)
if(WITH_PYTHON) if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op) list(APPEND PYBIND_DEPS py_func_op)
endif() endif()
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc)
if(WITH_PYTHON) if(WITH_PYTHON)
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
...@@ -21,9 +22,8 @@ if(WITH_PYTHON) ...@@ -21,9 +22,8 @@ if(WITH_PYTHON)
endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(NOT APPLE AND NOT ANDROID AND NOT WIN32)
endif(WITH_AMD_GPU) endif(WITH_AMD_GPU)
if(WIN32) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_pybind shlwapi) target_link_libraries(paddle_pybind ${os_dependency_modules})
endif(WIN32)
cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
endif(WITH_PYTHON) endif(WITH_PYTHON)
...@@ -26,7 +26,9 @@ void BindTracer(pybind11::module *m) { ...@@ -26,7 +26,9 @@ void BindTracer(pybind11::module *m) {
[](imperative::Tracer &self, framework::BlockDesc *root_block) { [](imperative::Tracer &self, framework::BlockDesc *root_block) {
new (&self) imperative::Tracer(root_block); new (&self) imperative::Tracer(root_block);
}) })
.def("trace", &imperative::Tracer::Trace); .def("trace", &imperative::Tracer::Trace)
.def("py_trace", &imperative::Tracer::PyTrace,
pybind11::return_value_policy::take_ownership);
} }
} // namespace pybind } // namespace pybind
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
class PyLayer : public imperative::Layer { class Layer : public imperative::Layer {
public: public:
using imperative::Layer::Layer; // Inherit constructors using imperative::Layer::Layer; // Inherit constructors
...@@ -31,10 +31,6 @@ class PyLayer : public imperative::Layer { ...@@ -31,10 +31,6 @@ class PyLayer : public imperative::Layer {
PYBIND11_OVERLOAD(std::vector<imperative::VarBase>, Layer, Forward, PYBIND11_OVERLOAD(std::vector<imperative::VarBase>, Layer, Forward,
inputs); // NOLINT inputs); // NOLINT
} }
void Backward() override {
PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT
}
}; };
class PyOpBase : public imperative::OpBase { class PyOpBase : public imperative::OpBase {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/pybind/ir.h"
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "pybind11/stl.h"
namespace py = pybind11;
using paddle::framework::ir::Graph;
using paddle::framework::ir::Node;
using paddle::framework::OpDesc;
using paddle::framework::ProgramDesc;
using paddle::framework::VarDesc;
using pybind11::return_value_policy;
namespace paddle {
namespace pybind {
void BindGraph(py::module *m) {
py::class_<Graph, std::shared_ptr<Graph>>(
*m, "Graph",
"The graph is a Directed Acyclic Single Static Assignment Graph, see "
"`paddle::ir::Graph` for details.")
.def(py::init<const ProgramDesc &>())
.def("has", &Graph::Has)
.def("get_int", &Graph::Get<int>)
.def("get_float", &Graph::Get<float>)
.def("get_double", &Graph::Get<double>)
.def("get_string", &Graph::Get<std::string>)
.def("set", [](Graph &self, const std::string &attr_name,
int attr) { return self.Set(attr_name, new int(attr)); })
.def("set",
[](Graph &self, const std::string &attr_name,
const std::string &attr) {
return self.Set(attr_name, new std::string(attr));
})
.def("set",
[](Graph &self, const std::string &attr_name, float attr) {
return self.Set(attr_name, new float(attr));
})
.def("set",
[](Graph &self, const std::string &attr_name, double attr) {
return self.Set(attr_name, new double(attr));
})
.def("erase", &Graph::Erase)
.def("nodes", &Graph::Nodes, return_value_policy::reference)
.def("create_var_node",
[](Graph &self, VarDesc &var_desc) {
return self.CreateVarNode(&var_desc);
},
return_value_policy::reference)
.def("create_op_node",
[](Graph &self, OpDesc &op_desc) {
return self.CreateOpNode(&op_desc);
},
return_value_policy::reference)
.def("create_control_dep_var", &Graph::CreateControlDepVar,
return_value_policy::reference)
.def("create_empty_node", &Graph::CreateEmptyNode,
return_value_policy::reference)
.def("release_nodes", &Graph::ReleaseNodes)
.def("remove_node",
[](Graph &self, Node &node) { return self.RemoveNode(&node); })
.def("retrieve_node", &Graph::RetrieveNode,
return_value_policy::reference)
.def("resolve_hazard", &Graph::ResolveHazard);
}
void BindNode(py::module *m) {
py::class_<Node> node(*m, "Node");
node.def("name", &Node::Name)
.def("node_type", &Node::NodeType)
.def("var", &Node::Var)
.def("op", &Node::Op)
.def("id", &Node::id)
.def("is_op", &Node::IsOp)
.def("is_var", &Node::IsVar)
.def("is_ctrl_var", &Node::IsCtrlVar)
.def_readwrite("inputs", &Node::inputs)
.def_readwrite("outputs", &Node::outputs);
py::enum_<Node::Type>(node, "Type")
.value("Operation", Node::Type::kOperation)
.value("Variable", Node::Type::kVariable)
.export_values();
}
} // namespace pybind
} // namespace paddle
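A hedged sketch of driving these new bindings from Python, assuming a build that exposes them on paddle.fluid.core; the method names mirror the .def() calls above:

    from paddle.fluid import core, framework

    prog = framework.Program()
    graph = core.Graph(prog.desc)  # ir::Graph built from a ProgramDesc
    graph.set("depth", 3)          # stores an int attribute
    assert graph.has("depth") and graph.get_int("depth") == 3
    for node in graph.nodes():
        if node.is_op():
            print(node.name(), node.id())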
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/pybind11.h>
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace pybind {
void BindGraph(pybind11::module *m);
void BindNode(pybind11::module *m);
} // namespace pybind
} // namespace paddle
...@@ -49,6 +49,7 @@ limitations under the License. */ ...@@ -49,6 +49,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/recordio.h" #include "paddle/fluid/pybind/recordio.h"
...@@ -125,26 +126,18 @@ PYBIND11_MODULE(core, m) { ...@@ -125,26 +126,18 @@ PYBIND11_MODULE(core, m) {
m.add_object("_cleanup", m.add_object("_cleanup",
py::capsule([]() { ScopePool::Instance().Clear(); })); py::capsule([]() { ScopePool::Instance().Clear(); }));
py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>( py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC")
m, "VarBase", R"DOC()DOC")
// .def(py::init<>()) // .def(py::init<>())
.def(py::init<bool>(), py::arg("stop_gradient") = false) .def(py::init<bool>(), py::arg("stop_gradient") = false)
.def("_run_backward", .def("_run_backward",
[](imperative::VarBase &self) { self.RunBackward(); }) [](imperative::VarBase &self) { self.RunBackward(); })
.def("_grad_name", &imperative::VarBase::GradName) .def("_grad_name", &imperative::VarBase::GradName)
.def("_grad", &imperative::VarBase::Grad) .def("_grad_value", &imperative::VarBase::GradValue)
.def_property("grad_value", .def("_grad_ivar",
[](const imperative::VarBase &self) { return self.grads_; }, [](const imperative::VarBase &self) { return self.grads_; },
[](imperative::VarBase &self, framework::Variable *grad) { py::return_value_policy::reference)
self.grads_ = grad; .def("value", [](const imperative::VarBase &self) { return self.var_; },
}, py::return_value_policy::reference)
py::return_value_policy::reference)
.def_property("value",
[](const imperative::VarBase &self) { return self.var_; },
[](imperative::VarBase &self, framework::Variable *var) {
self.var_ = var;
},
py::return_value_policy::reference)
.def_property( .def_property(
"desc", "desc",
[](const imperative::VarBase &self) { return self.var_desc_; }, [](const imperative::VarBase &self) { return self.var_desc_; },
...@@ -168,16 +161,44 @@ PYBIND11_MODULE(core, m) { ...@@ -168,16 +161,44 @@ PYBIND11_MODULE(core, m) {
self.op_desc_ = op_desc; self.op_desc_ = op_desc;
} }
}, },
py::return_value_policy::reference)
.def_property(
"forward_id",
[](const imperative::OpBase &self) { return self.forward_id_; },
[](imperative::OpBase &self, int forward_id) {
self.forward_id_ = forward_id;
},
py::return_value_policy::reference)
.def_property(
"backward_id",
[](const imperative::OpBase &self) { return self.backward_id_; },
[](imperative::OpBase &self, int backward_id) {
self.backward_id_ = backward_id;
},
py::return_value_policy::reference); py::return_value_policy::reference);
py::class_<imperative::Layer, PyLayer /* <--- trampoline*/> layer(m, "Layer"); py::class_<imperative::Layer, Layer /* <--- trampoline*/> layer(m, "Layer");
layer.def(py::init<>()) layer.def(py::init<>())
.def("forward", .def("forward", [](imperative::Layer &self,
[](imperative::Layer &self, const std::vector<imperative::VarBase> &inputs) {
const std::vector<imperative::VarBase> &inputs) { return self.Forward(inputs);
return self.Forward(inputs); });
})
.def("backward", &imperative::Layer::Backward); py::class_<imperative::PyLayer>(m, "PyLayer")
.def(py::init<>())
.def_static(
"apply",
[](int func_id, const std::vector<imperative::VarBase *> &inputs)
-> std::vector<imperative::VarBase *> {
return imperative::PyLayer::Apply(func_id, inputs);
},
py::return_value_policy::take_ownership)
.def_static("register_func",
[](int func_id, const py::object &callable) {
imperative::PyLayer::RegisterFunc(func_id, callable);
})
.def_static("num_funcs", &imperative::PyLayer::NumFuncs);
BindTracer(&m); BindTracer(&m);
py::class_<Tensor>(m, "Tensor", py::buffer_protocol()) py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
...@@ -775,7 +796,12 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -775,7 +796,12 @@ All parameter, weight, gradient are variables in Paddle.
}) })
.def("set_int", [](ir::Pass &self, const std::string &name, .def("set_int", [](ir::Pass &self, const std::string &name,
int val) { self.Set<const int>(name, new int(val)); }) int val) { self.Set<const int>(name, new int(val)); })
.def("type", &ir::Pass::Type); .def("type", &ir::Pass::Type)
.def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
std::unique_ptr<ir::Graph> origin_graph(graph.get());
auto optim_graph = self.Apply(std::move(origin_graph));
graph.reset(optim_graph.release());
});
py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb( py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
m, "PassBuilder"); m, "PassBuilder");
...@@ -1019,8 +1045,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1019,8 +1045,7 @@ All parameter, weight, gradient are variables in Paddle.
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &, const ProgramDesc &, const std::unordered_set<std::string> &, const ProgramDesc &,
const std::string &, Scope *, std::vector<Scope *> &, const std::string &, Scope *, std::vector<Scope *> &,
const ExecutionStrategy &, const BuildStrategy &, size_t, const ExecutionStrategy &, const BuildStrategy &>())
size_t>())
// NOTE: even we return a vec<Scope*>* to Python use reference policy. // NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element // We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope* // of vec<Scope*> will be freed by Python GC. We can only return Scope*
...@@ -1043,6 +1068,9 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1043,6 +1068,9 @@ All parameter, weight, gradient are variables in Paddle.
BindRecordIOWriter(&m); BindRecordIOWriter(&m);
BindAsyncExecutor(&m); BindAsyncExecutor(&m);
BindGraph(&m);
BindNode(&m);
} }
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -490,7 +490,8 @@ function assert_api_spec_approvals() { ...@@ -490,7 +490,8 @@ function assert_api_spec_approvals() {
BRANCH="develop" BRANCH="develop"
fi fi
API_FILES=("paddle/fluid/API.spec" API_FILES=("cmake/external"
"paddle/fluid/API.spec"
"paddle/fluid/framework/operator.h" "paddle/fluid/framework/operator.h"
"paddle/fluid/framework/tensor.h" "paddle/fluid/framework/tensor.h"
"paddle/fluid/framework/lod_tensor.h" "paddle/fluid/framework/lod_tensor.h"
......
...@@ -21,10 +21,9 @@ parse training set and test set into paddle reader creators. ...@@ -21,10 +21,9 @@ parse training set and test set into paddle reader creators.
from __future__ import print_function from __future__ import print_function
import paddle.dataset.common import paddle.dataset.common
import subprocess import gzip
import numpy import numpy
import platform import struct
import tempfile
from six.moves import range from six.moves import range
__all__ = ['train', 'test', 'convert'] __all__ = ['train', 'test', 'convert']
...@@ -41,51 +40,47 @@ TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' ...@@ -41,51 +40,47 @@ TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
def reader_creator(image_filename, label_filename, buffer_size): def reader_creator(image_filename, label_filename, buffer_size):
def reader(): def reader():
if platform.system() == 'Darwin': with gzip.GzipFile(image_filename, 'rb') as image_file:
zcat_cmd = 'gzcat' img_buf = image_file.read()
elif platform.system() == 'Linux': with gzip.GzipFile(label_filename, 'rb') as label_file:
zcat_cmd = 'zcat' lab_buf = label_file.read()
else:
raise NotImplementedError() step_label = 0
# According to http://stackoverflow.com/a/38061619/724872, we offset_img = 0
# cannot use standard package gzip here. # read from Big-endian
tmp_image_file = tempfile.TemporaryFile(prefix='paddle_dataset') # get file info from magic byte
m = subprocess.Popen( # image file : 16B
[zcat_cmd, image_filename], stdout=tmp_image_file).communicate() magic_byte_img = '>IIII'
tmp_image_file.seek(16) # skip some magic bytes magic_img, image_num, rows, cols = struct.unpack_from(
magic_byte_img, img_buf, offset_img)
# Python3 will not take stdout as file offset_img += struct.calcsize(magic_byte_img)
tmp_label_file = tempfile.TemporaryFile(prefix='paddle_dataset')
l = subprocess.Popen( offset_lab = 0
[zcat_cmd, label_filename], stdout=tmp_label_file).communicate() # label file : 8B
tmp_label_file.seek(8) # skip some magic bytes magic_byte_lab = '>II'
magic_lab, label_num = struct.unpack_from(magic_byte_lab,
try: # reader could be break. lab_buf, offset_lab)
while True: offset_lab += struct.calcsize(magic_byte_lab)
labels = numpy.fromfile(
tmp_label_file, 'ubyte', count=buffer_size).astype("int") while True:
if step_label >= label_num:
if labels.size != buffer_size: break
break # numpy.fromfile returns empty slice after EOF. fmt_label = '>' + str(buffer_size) + 'B'
labels = struct.unpack_from(fmt_label, lab_buf, offset_lab)
images = numpy.fromfile( offset_lab += struct.calcsize(fmt_label)
tmp_image_file, 'ubyte', count=buffer_size * 28 * step_label += buffer_size
28).reshape((buffer_size, 28 * 28)).astype('float32')
fmt_images = '>' + str(buffer_size * rows * cols) + 'B'
images = images / 255.0 * 2.0 - 1.0 images_temp = struct.unpack_from(fmt_images, img_buf,
offset_img)
for i in range(buffer_size): images = numpy.reshape(images_temp, (
yield images[i, :], int(labels[i]) buffer_size, rows * cols)).astype('float32')
finally: offset_img += struct.calcsize(fmt_images)
try:
m.terminate() images = images / 255.0 * 2.0 - 1.0
except: for i in range(buffer_size):
pass yield images[i, :], int(labels[i])
try:
l.terminate()
except:
pass
return reader return reader
......
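The rewritten reader depends on the IDX header layout: the image file opens with four big-endian uint32 fields (magic, count, rows, cols) and the label file with two. A standalone parsing sketch; the file path is illustrative:

    import gzip
    import struct

    with gzip.GzipFile('train-images-idx3-ubyte.gz', 'rb') as f:  # example path
        buf = f.read()
    magic, num_images, rows, cols = struct.unpack_from('>IIII', buf, 0)
    offset = struct.calcsize('>IIII')  # pixel bytes start at offset 16
    first_image = struct.unpack_from('>%dB' % (rows * cols), buf, offset)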
...@@ -155,7 +155,8 @@ def __bootstrap__(): ...@@ -155,7 +155,8 @@ def __bootstrap__():
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
'cudnn_exhaustive_search_times', 'sync_nccl_allreduce' 'sync_nccl_allreduce', 'limit_of_tmp_allocation',
'times_excess_than_required_tmp_allocation'
] ]
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import multiprocessing
import os
import six
import sys
from .. import compat as cpt
from . import core
ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
BuildStrategy = core.ParallelExecutor.BuildStrategy
def _place_obj(place):
p = core.Place()
p.set_place(place)
return p
class CompiledProgram(object):
"""
Compiles a Program for execution.
1. Users first create the program with layers.
2. Optionally, users use CompiledProgram to optimize the program before run.
3. The original program or CompiledProgram is run by the executor.
The CompiledProgram is used to transform a program for various
optimizations, for example:
* Pre-compute some logic once so that each run is faster.
* Transform the program so that it can run in multiple devices.
* TODO: transform the program for optimized inference or distributed
training.
Example:
.. code-block:: python
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
loss_name=loss.name)
for i in range(5):
test_loss, = exe.run(compiled_prog,
feed=feed_dict,
fetch_list=[loss.name])
Args:
program: Program instance that contains the model logic.
"""
def __init__(self, program):
self._program = program
self._scope = None
self._place = None
self._executor = None
self._compiled = False
self._is_data_parallel = False
def with_data_parallel(self,
loss_name=None,
build_strategy=None,
exec_strategy=None,
share_vars_from=None):
"""Configs the program to run in data parallel way.
Args:
loss_name (str): The loss name that must be set in training. Default None.
build_strategy(BuildStrategy): build_strategy is used to
build the graph so it can run on multiple devices/cores with
optimized topology.
For more information, please refer to fluid.BuildStrategy.
Default None.
exec_strategy(ExecutionStrategy): exec_strategy is used to
select the way to execute the graph, for example how many
threads are used, how many iterations to clean up the temp
variables. For more information, please refer
to fluid.ExecutionStrategy. Default None.
share_vars_from(CompiledProgram): If provided, this CompiledProgram
will share variables from `share_vars_from`. `share_vars_from`
must be run by the executor before this CompiledProgram so that
vars are ready.
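        Example:
            .. code-block:: python

                # Illustrative sketch: share parameters between a training
                # and a test program. train_compiled must be run by the
                # executor before test_compiled so the shared vars exist.
                train_compiled = compiler.CompiledProgram(
                    train_program).with_data_parallel(loss_name=loss.name)
                test_compiled = compiler.CompiledProgram(
                    test_program).with_data_parallel(
                        share_vars_from=train_compiled)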
Returns:
self
"""
assert not self._is_data_parallel, "Already compiled with parallel."
self._is_data_parallel = True
self._build_strategy = build_strategy
self._exec_strategy = exec_strategy
self._loss_name = loss_name
self._share_vars_from = share_vars_from
if self._exec_strategy is None:
self._exec_strategy = ExecutionStrategy()
if self._build_strategy is None:
self._build_strategy = BuildStrategy()
return self
def _with_distributed(self):
raise NotImplementedError()
def _with_inference_optimize(self):
raise NotImplementedError()
def _compile_data_parallel(self):
if self._share_vars_from:
if self._scope:
sys.stderr.write("share_vars_from is set, scope is ignored.\n")
if not self._share_vars_from._is_data_parallel:
raise ValueError("share_vars_from is not data parallel. Cannot "
"share vars from it.")
if self._share_vars_from._executor is None:
raise ValueError(
"share_vars_from is not compiled and run, so there is no "
"var to share.")
self._local_scopes = self._share_vars_from._executor.local_scopes()
else:
self._local_scopes = []
self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace)
if self._exec_strategy.use_cuda:
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
gpus = [
i for i in six.moves.range(core.get_cuda_device_count())
]
self._places = [core.CUDAPlace(i) for i in gpus]
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
assert self._places, "no place for execution"
if self._exec_strategy.num_threads == 0:
if self._exec_strategy.use_cuda:
                # Experiments on se-resnext show that too many threads hurt
                # performance. Worth tuning for other models in the future.
self._exec_strategy.num_threads = len(self._places) * 4
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self._exec_strategy.num_threads = cpu_num * 2
trainers_endpoints = self._program._trainers_endpoints
if self._build_strategy.num_trainers > 1 and trainers_endpoints:
assert self._build_strategy.num_trainers == len(
trainers_endpoints), "num_trainers == len(end_points)"
self._build_strategy.trainers_endpoints = trainers_endpoints
self._persistable_vars = set([
cpt.to_text(v.name)
for v in [
var for var in self._program.list_vars()
if var.persistable and var.type != core.VarDesc.VarType.RAW
]
])
places = list(map(_place_obj, self._places))
return core.ParallelExecutor(
places, self._persistable_vars, self._program.desc,
cpt.to_text(self._loss_name)
if self._loss_name else six.u(''), self._scope, self._local_scopes,
self._exec_strategy, self._build_strategy)
def _compile(self, scope, place):
"""Compile the program based on the configs.
Args:
scope: The variables (resources) that are associated with
this compiled program.
place: The location that the compiled program will be run on.
Returns:
self
"""
if self._compiled:
if scope and self._scope != scope:
raise ValueError("Cannot compile with different scope")
if place and self._place != place:
raise ValueError("Cannot compile with different place")
return self
self._compiled = True
self._scope = scope
self._place = place
if self._is_data_parallel:
self._executor = self._compile_data_parallel()
else:
p = _place_obj(self._place)
self._executor = core.Executor(p)
return self
...@@ -71,10 +71,25 @@ class DataToLoDTensorConverter(object): ...@@ -71,10 +71,25 @@ class DataToLoDTensorConverter(object):
for each_data in data: for each_data in data:
self._feed_impl_(each_data, lod[1:], lod_level - 1) self._feed_impl_(each_data, lod[1:], lod_level - 1)
def _check_shape(self, shape):
for s1, s2 in zip(self.shape, shape):
if s1 != s2 and s1 >= 0 and s2 >= 0:
raise ValueError(
"Shape not match. What is defined in data layer is {}, but receive {}".
format(self.shape, shape))
def done(self): def done(self):
arr = numpy.array(self.data, dtype=self.dtype) arr = numpy.array(self.data, dtype=self.dtype)
if self.shape and len(arr.shape) != len(self.shape): if self.shape:
arr = arr.reshape(self.shape) if len(arr.shape) != len(self.shape):
try:
arr = arr.reshape(self.shape)
except ValueError:
raise ValueError(
"Reshape error. What is defined in data layer is {}, but receive {}"
.format(self.shape, arr.shape))
else:
self._check_shape(arr.shape)
t = core.LoDTensor() t = core.LoDTensor()
t.set(arr, self.place) t.set(arr, self.place)
if self.lod_level > 0: if self.lod_level > 0:
...@@ -152,17 +167,8 @@ class DataFeeder(object): ...@@ -152,17 +167,8 @@ class DataFeeder(object):
raise TypeError("Feed list should contain a list of variable") raise TypeError("Feed list should contain a list of variable")
self.feed_dtypes.append(each_var.dtype) self.feed_dtypes.append(each_var.dtype)
self.feed_names.append(each_var.name) self.feed_names.append(each_var.name)
shape = each_var.shape
batch_size_dim = -1
for i, s in enumerate(shape):
if s < 0:
batch_size_dim = i
break
if batch_size_dim == -1:
raise ValueError("Variable {0} must has a batch size dimension",
each_var.name)
self.feed_lod_level.append(each_var.lod_level) self.feed_lod_level.append(each_var.lod_level)
self.feed_shapes.append(shape) self.feed_shapes.append(each_var.shape)
self.place = place self.place = place
......
...@@ -14,11 +14,15 @@ ...@@ -14,11 +14,15 @@
from __future__ import print_function from __future__ import print_function
import os
import multiprocessing
import numpy as np import numpy as np
import contextlib import contextlib
import six import six
from .framework import Program, default_main_program, Variable from .framework import Program, default_main_program, Variable
from . import core from . import core
from . import compiler
from .. import compat as cpt
__all__ = ['Executor', 'global_scope', 'scope_guard'] __all__ = ['Executor', 'global_scope', 'scope_guard']
...@@ -204,20 +208,20 @@ def _fetch_var(name, scope=None, return_numpy=True): ...@@ -204,20 +208,20 @@ def _fetch_var(name, scope=None, return_numpy=True):
return tensor return tensor
def _get_program_cache_key(feed, fetch_list): def _to_name_str(var):
feed_var_names = list(feed.keys()) if isinstance(var, Variable):
return var.desc.name()
elif isinstance(var, str):
return var
elif isinstance(var, six.string_types):
return str(var)
else:
raise TypeError(str(var) + " should be Variable or str")
def to_name_str(var):
if isinstance(var, Variable):
return var.desc.name()
elif isinstance(var, str):
return var
elif isinstance(var, six.string_types):
return str(var)
else:
raise TypeError(str(var) + " should be Variable or str")
fetch_var_names = list(map(to_name_str, fetch_list)) def _get_program_cache_key(feed, fetch_list):
feed_var_names = list(feed.keys())
fetch_var_names = list(map(_to_name_str, fetch_list))
return str(feed_var_names + fetch_var_names) return str(feed_var_names + fetch_var_names)
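# For example (illustrative names): feed={'image': img} with
# fetch_list=['loss'] produces the cache key "['image', 'loss']".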
...@@ -266,6 +270,29 @@ class Executor(object): ...@@ -266,6 +270,29 @@ class Executor(object):
But the global scope variables will be persistent through different runs. But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence. All of ops in program will be running in sequence.
Example:
.. code-block:: python
# First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Run the startup program once and only once.
# No need to optimize/compile the startup program.
exe.run(fluid.default_startup_program())
# Run the main program directly without compiling it.
loss, = exe.run(fluid.default_main_program(),
feed=feed_dict,
fetch_list=[loss.name])
# Or, compile the program and run it. See `CompiledProgram` for more detail.
compiled_prog = compiler.CompiledProgram(
fluid.default_main_program()).with_data_parallel(
loss_name=loss.name)
loss, = exe.run(compiled_prog,
feed=feed_dict,
fetch_list=[loss.name])
Args: Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
...@@ -275,11 +302,8 @@ class Executor(object): ...@@ -275,11 +302,8 @@ class Executor(object):
def __init__(self, place): def __init__(self, place):
self.place = place self.place = place
p = core.Place()
p.set_place(place)
self.executor = core.Executor(p)
self.program_caches = dict() self.program_caches = dict()
self.executor = None
self._closed = False self._closed = False
def _get_program_cache(self, program_cache_key): def _get_program_cache(self, program_cache_key):
...@@ -361,6 +385,7 @@ class Executor(object): ...@@ -361,6 +385,7 @@ class Executor(object):
You can no longer use this executor after calling this method. You can no longer use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to For the distributed training, this method would free the resource on PServers related to
the current Trainer. the current Trainer.
TODO(panyx0718): Why doesn't ParallelExecutor have close?
Example: Example:
>>> cpu = core.CPUPlace() >>> cpu = core.CPUPlace()
...@@ -368,10 +393,55 @@ class Executor(object): ...@@ -368,10 +393,55 @@ class Executor(object):
>>> ... >>> ...
>>> exe.close() >>> exe.close()
""" """
if not self._closed: if not self._closed and self.executor:
self.executor.close() self.executor.close()
self._closed = True self._closed = True
def _run_parallel(self, scope, feed, fetch_list, fetch_var_name,
return_numpy):
if isinstance(feed, dict):
feed_tensor_dict = dict()
for feed_name in feed:
feed_tensor = feed[feed_name]
if not isinstance(feed_tensor, core.LoDTensor):
feed_tensor = core.LoDTensor()
                    # always set to CPU place, since the tensor needs to be
                    # split, and splitting is fast on CPU
feed_tensor.set(feed[feed_name], core.CPUPlace())
feed_tensor_dict[feed_name] = feed_tensor
self.executor.feed_and_split_tensor_into_local_scopes(
feed_tensor_dict)
elif isinstance(feed, list) or isinstance(feed, tuple):
if len(feed) != len(self._places):
raise ValueError(
"Feed a list of tensor, the list should be the same size as places"
)
res = list()
for i, each in enumerate(feed):
if not isinstance(each, dict):
raise TypeError(
"Each element of feed list should be a dict")
res_dict = dict()
for feed_name in each:
tensor = each[feed_name]
if not isinstance(tensor, core.LoDTensor):
tmp = core.LoDTensor()
tmp.set(tensor, self._places[i])
tensor = tmp
res_dict[feed_name] = tensor
res.append(res_dict)
self.executor.feed_tensors_into_local_scopes(res)
fetch_var_names = list(map(_to_name_str, fetch_list))
self.executor.run(fetch_var_names, fetch_var_name)
arr = scope.find_var(fetch_var_name).get_lod_tensor_array()
if return_numpy:
return as_numpy(arr)
return [arr[i] for i in range(len(arr))]
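    # The two feed forms accepted above, with placeholder variables:
    #   exe.run(compiled_prog, feed={'image': whole_batch})   # one dict, split
    #   exe.run(compiled_prog, feed=[{'image': batch_dev0},
    #                                {'image': batch_dev1}])  # one per place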
def run(self, def run(self,
program=None, program=None,
feed=None, feed=None,
...@@ -391,8 +461,9 @@ class Executor(object): ...@@ -391,8 +461,9 @@ class Executor(object):
operators in the program but not only the operators dependent by the fetch_list operators in the program but not only the operators dependent by the fetch_list
Args: Args:
program(Program): the program that need to run, if not provied, then default_main_program will be used. program(Program|CompiledProgram): the program that needs to run,
feed(dict): feed variable map, e.g. {"image": ImageData, "label": LableData} if not provided, then default_main_program will be used.
feed(dict): feed variable map, e.g. {"image": ImageData, "label": LabelData}
fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list. fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list.
feed_var_name(str): the name for the input variable of feed Operator. feed_var_name(str): the name for the input variable of feed Operator.
fetch_var_name(str): the name for the output variable of fetch Operator. fetch_var_name(str): the name for the output variable of fetch Operator.
...@@ -428,14 +499,59 @@ class Executor(object): ...@@ -428,14 +499,59 @@ class Executor(object):
if self._closed: if self._closed:
raise RuntimeError("Attempted to use a closed Executor") raise RuntimeError("Attempted to use a closed Executor")
if scope is None:
scope = global_scope()
if fetch_list is None:
fetch_list = []
compiled = isinstance(program, compiler.CompiledProgram)
# For backward compatibility, run directly.
if not compiled:
if not self.executor:
p = core.Place()
p.set_place(self.place)
self.executor = core.Executor(p)
return self._run(
program,
feed=feed,
fetch_list=fetch_list,
feed_var_name=feed_var_name,
fetch_var_name=fetch_var_name,
scope=scope,
return_numpy=return_numpy,
use_program_cache=use_program_cache)
program._compile(scope, self.place)
self.executor = program._executor
if program._is_data_parallel:
return self._run_parallel(
scope=scope,
feed=feed,
fetch_list=fetch_list,
fetch_var_name=fetch_var_name,
return_numpy=return_numpy)
else:
# TODO(panyx0718): Can compile program to optimize executor
# performance.
return self._run(
program._program,
feed=feed,
fetch_list=fetch_list,
feed_var_name=feed_var_name,
fetch_var_name=fetch_var_name,
scope=scope,
return_numpy=return_numpy,
use_program_cache=use_program_cache)
def _run(self, program, feed, fetch_list, feed_var_name, fetch_var_name,
scope, return_numpy, use_program_cache):
if feed is None: if feed is None:
feed = {} feed = {}
if not isinstance(feed, dict): if not isinstance(feed, dict):
raise TypeError( raise TypeError(
"feed requires dict as its Parameter. But you passed in %s" % "feed requires dict as its Parameter. But you passed in %s" %
(type(feed))) (type(feed)))
if fetch_list is None:
fetch_list = []
if program is None: if program is None:
program = default_main_program() program = default_main_program()
...@@ -444,9 +560,6 @@ class Executor(object): ...@@ -444,9 +560,6 @@ class Executor(object):
"Executor requires Program as its Parameter. But you passed in %s" "Executor requires Program as its Parameter. But you passed in %s"
% (type(program))) % (type(program)))
if scope is None:
scope = global_scope()
cache_key = _get_program_cache_key(feed, fetch_list) cache_key = _get_program_cache_key(feed, fetch_list)
if use_program_cache: if use_program_cache:
cached_program = self._get_program_cache(cache_key) cached_program = self._get_program_cache(cache_key)
......
...@@ -373,27 +373,21 @@ class Variable(object): ...@@ -373,27 +373,21 @@ class Variable(object):
self.stop_gradient = stop_gradient self.stop_gradient = stop_gradient
self.is_data = is_data self.is_data = is_data
if _in_imperative_mode(): if _in_imperative_mode():
self._ivar = core.VarBase() self._ivar = kwargs.get("ivar", None)
if not self._ivar:
self._ivar = core.VarBase()
self._ivar.desc = self.desc self._ivar.desc = self.desc
self._ivar.stop_gradient = stop_gradient self._ivar.stop_gradient = stop_gradient
def _numpy(self): def _numpy(self):
tensor = self._ivar.value.get_tensor() tensor = self._ivar.value().get_tensor()
return np.array(tensor) return np.array(tensor)
def _backward(self): def _backward(self):
self._ivar._run_backward() self._ivar._run_backward()
def _gradient(self): def _gradient(self):
return np.array(self._ivar._grad()) return np.array(self._ivar._grad_value())
@property
def _value(self):
return self._ivar.value
@_value.setter
def _value(self, v):
self._ivar.value = v
def __str__(self): def __str__(self):
return self.to_string(True) return self.to_string(True)
......
...@@ -45,7 +45,7 @@ def to_variable(value, block=None): ...@@ -45,7 +45,7 @@ def to_variable(value, block=None):
name=None, name=None,
shape=value.shape, shape=value.shape,
dtype=value.dtype) dtype=value.dtype)
var = py_var._ivar.value var = py_var._ivar.value()
tensor = var.get_tensor() tensor = var.get_tensor()
tensor.set(value, core.CPUPlace()) tensor.set(value, core.CPUPlace())
return py_var return py_var
......
@@ -20,10 +20,12 @@ from paddle.fluid import core
 from paddle.fluid import framework
 from paddle.fluid.imperative import base

-__all__ = ['PyLayer']
+__all__ = ['Layer', 'PyLayer']


-class PyLayer(core.Layer):
+class Layer(core.Layer):
+    """Layers composed of operators."""

     def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
         self._once_built = False
         self._dtype = dtype
@@ -37,8 +39,56 @@ class PyLayer(core.Layer):
             self._once_built = True

         outputs = self.forward(*inputs)
         return outputs

     def forward(self, *inputs):
         raise NotImplementedError

+    def backward(self, *inputs):
+        raise ValueError("Layer shouldn't implement backward")
+
+
+class PyLayer(core.PyLayer):
+    """Layers composed of user-defined python codes."""
+
+    def __init__(self):
+        super(PyLayer, self).__init__()
+
+    @staticmethod
+    def forward(*inputs):
+        raise NotImplementedError
+
+    @staticmethod
+    def backward(*douts):
+        raise NotImplementedError
+
+    @classmethod
+    def __call__(cls, *inputs):
+        tracer = framework._imperative_tracer()
+        block = framework.default_main_program().current_block()
+        ivar_inputs = [x._ivar for x in inputs]
+
+        if not hasattr(cls, 'forward_id'):
+            cls.forward_id = core.PyLayer.num_funcs() + 1
+            PyLayer.register_func(cls.forward_id, cls.forward)
+            cls.backward_id = core.PyLayer.num_funcs() + 1
+            PyLayer.register_func(cls.backward_id, cls.backward)
+
+        iop = core.OpBase()
+        iop.forward_id = cls.forward_id
+        iop.backward_id = cls.backward_id
+        block.ops.append(iop)
+        ivars = tracer.py_trace(iop, ivar_inputs, False)
+        # ivars = core.PyLayer.apply(cls.forward, inputs)
+        ret = []
+        for ivar in ivars:
+            tensor = ivar.value().get_tensor()
+            py_var = framework.Variable(
+                block,
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                name=None,
+                shape=tensor.shape(),
+                dtype=tensor._dtype(),
+                ivar=ivar)
+            ret.append(py_var)
+        return ret
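A hedged sketch of how the new PyLayer is meant to be subclassed, modeled on the MyPyLayer test later in this change; the tanh example and class name are illustrative:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid import core

    class TanhPyLayer(fluid.imperative.PyLayer):
        @staticmethod
        def forward(inputs):
            # compute in numpy and hand back LoDTensors
            t = core.LoDTensor()
            t.set(np.tanh(np.array(inputs[0])), core.CPUPlace())
            return tuple([t])

        @staticmethod
        def backward(inputs):
            # inputs are (input, output, output-grad) tensors
            inp, out, dout = [np.array(v) for v in inputs]
            t = core.LoDTensor()
            t.set(dout * (1 - np.square(out)), core.CPUPlace())
            return tuple([t])

    with fluid.imperative.guard():
        x = fluid.imperative.base.to_variable(np.ones([2, 2], np.float32))
        y, = TanhPyLayer()(x)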
@@ -30,7 +30,7 @@ __all__ = [
 ]


-class Conv2D(layers.PyLayer):
+class Conv2D(layers.Layer):
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -143,7 +143,7 @@ class Conv2D(layers.PyLayer):
         return self._helper.append_activation(pre_act)


-class Pool2D(layers.PyLayer):
+class Pool2D(layers.Layer):
     def __init__(self,
                  pool_size=-1,
                  pool_type="max",
@@ -205,7 +205,7 @@ class Pool2D(layers.PyLayer):
         return pool_out


-class FC(layers.PyLayer):
+class FC(layers.Layer):
     def __init__(self,
                  size,
                  param_attr=None,
...
@@ -58,6 +58,7 @@ __all__ = [
     'adaptive_pool2d',
     'adaptive_pool3d',
     'batch_norm',
+    'data_norm',
     'beam_search_decode',
     'conv2d_transpose',
     'conv3d_transpose',
@@ -180,6 +181,7 @@ __all__ = [
     'lstm',
     'py_func',
     'psroi_pool',
+    'teacher_student_sigmoid_loss',
     'huber_loss',
 ]
@@ -2896,6 +2898,133 @@ def batch_norm(input,
     return helper.append_activation(batch_norm_out)


+def data_norm(input,
+              act=None,
+              epsilon=1e-05,
+              param_attr=None,
+              data_layout='NCHW',
+              in_place=False,
+              use_mkldnn=False,
+              name=None,
+              moving_mean_name=None,
+              moving_variance_name=None,
+              do_model_average_for_mean_and_var=False):
+    """
+    **Data Normalization Layer**
+
+    Can be used as a normalizer function for conv2d and fully_connected operations.
+    The required data format for this layer is one of the following:
+
+    1. NHWC `[batch, in_height, in_width, in_channels]`
+    2. NCHW `[batch, in_channels, in_height, in_width]`
+
+    :math:`input` is the input features over a mini-batch.
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+        act(string, Default None): Activation type, linear|relu|prelu|...
+        epsilon(float, Default 1e-05): A small value added to the variance to
+            avoid division by zero.
+        param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
+        data_layout(string, default NCHW): NCHW|NHWC
+        in_place(bool, Default False): Make the input and output of data norm reuse memory.
+        use_mkldnn(bool, Default false): ${use_mkldnn_comment}
+        name(string, Default None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+        moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
+        moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
+        do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
+
+    Returns:
+        Variable: A tensor variable which is the result after applying data normalization on the input.
+
+    Examples:
+
+        .. code-block:: python
+
+            data = fluid.layers.data(name='x', shape=[200], dtype='float32')
+            hidden = fluid.layers.data_norm(name="dn", input=data)
+    """
+    helper = LayerHelper('data_norm', **locals())
+    dtype = helper.input_dtype()
+
+    input_shape = input.shape
+    if data_layout == 'NCHW':
+        channel_num = input_shape[1]
+    else:
+        if data_layout == 'NHWC':
+            channel_num = input_shape[-1]
+        else:
+            raise ValueError("unsupported data layout:" + data_layout)
+
+    param_shape = [channel_num]
+
+    batch_size_default = 1e4
+    batch_sum_default = 0.0
+    batch_square_sum_default = 1e4
+
+    if param_attr and isinstance(param_attr, dict):
+        batch_size_default = param_attr.get("batch_size", 1e4)
+        batch_sum_default = param_attr.get("batch_sum", 0.0)
+        batch_square_sum_default = param_attr.get("batch_square", 1e4)
+
+    # create parameter
+    batch_size = helper.create_parameter(
+        attr=ParamAttr(
+            name=name + '.batch_size',
+            initializer=Constant(value=float(batch_size_default)),
+            trainable=True),
+        shape=param_shape,
+        dtype=input.dtype)
+
+    batch_sum = helper.create_parameter(
+        attr=ParamAttr(
+            name=name + '.batch_sum',
+            initializer=Constant(value=float(batch_sum_default)),
+            trainable=True),
+        shape=param_shape,
+        dtype=input.dtype)
+
+    batch_square_sum = helper.create_parameter(
+        attr=ParamAttr(
+            name=name + '.batch_square_sum',
+            initializer=Constant(value=float(batch_square_sum_default)),
+            trainable=True),
+        shape=param_shape,
+        dtype=input.dtype)
+
+    means = helper.create_variable(dtype=dtype, stop_gradient=True)
+    scales = helper.create_variable(dtype=dtype, stop_gradient=True)
+
+    data_norm_out = input if in_place else helper.create_variable(dtype=dtype)
+
+    helper.append_op(
+        type="data_norm",
+        inputs={
+            "X": input,
+            "BatchSize": batch_size,
+            "BatchSum": batch_sum,
+            "BatchSquareSum": batch_square_sum
+        },
+        outputs={"Y": data_norm_out,
+                 "Means": means,
+                 "Scales": scales},
+        attrs={"epsilon": epsilon,
+               "use_mkldnn": use_mkldnn})
+
+    return helper.append_activation(data_norm_out)
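A hedged usage sketch; note that the implementation above builds parameter names from `name`, so one is passed explicitly, and the initial statistics can be seeded through the `param_attr` dict keys handled above:

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[200], dtype='float32')
    # creates parameters dn.batch_size, dn.batch_sum, dn.batch_square_sum
    normed = fluid.layers.data_norm(
        name='dn',
        input=x,
        param_attr={'batch_size': 1e4,
                    'batch_sum': 0.0,
                    'batch_square': 1e4})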
@templatedoc()
def layer_norm(input,
               scale=True,
@@ -3064,9 +3193,9 @@ def group_norm(input,
         inputs['Bias'] = bias

     # create output
-    mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
-    variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
-    group_norm_out = helper.create_tmp_variable(dtype)
+    mean_out = helper.create_variable(dtype=dtype, stop_gradient=True)
+    variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
+    group_norm_out = helper.create_variable(dtype)

     helper.append_op(
         type="group_norm",
@@ -9264,6 +9393,47 @@ def log_loss(input, label, epsilon=1e-4, name=None):
     return loss
+def teacher_student_sigmoid_loss(input,
+                                 label,
+                                 soft_max_up_bound=15.0,
+                                 soft_max_lower_bound=-15.0):
+    """
+    **Teacher Student Log Loss Layer**
+
+    This layer accepts input predictions and target label and returns the
+    teacher_student loss.
+
+    .. math::
+        loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x)))
+
+    Args:
+        input (Variable|list): a 2-D tensor with shape [N x 1], where N is the
+            batch size. This input is a probability computed
+            by the previous operator.
+        label (Variable|list): the ground truth which is a 2-D tensor with
+            shape [N x 1], where N is the batch size.
+        soft_max_up_bound (float): if input > soft_max_up_bound, it will be clipped to this bound
+        soft_max_lower_bound (float): if input < soft_max_lower_bound, it will be clipped to this bound
+
+    Returns:
+        Variable: A 2-D tensor with shape [N x 1], the teacher_student_sigmoid_loss.
+
+    Examples:
+        .. code-block:: python
+
+            cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label)
+    """
+    helper = LayerHelper('teacher_student_sigmoid_loss', **locals())
+    out = helper.create_variable(dtype=input.dtype)
+    helper.append_op(
+        type='teacher_student_sigmoid_loss',
+        inputs={'X': [input],
+                'Label': [label]},
+        outputs={'Y': [out]},
+        attrs={"soft_max_lower_bound": float(soft_max_lower_bound), \
+               "soft_max_up_bound": float(soft_max_up_bound)})
+    return out
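Each of the two terms in the formula above is the standard numerically stable form of sigmoid cross entropy; for reference, that term in numpy:

.. code-block:: python

    import numpy as np

    def sigmoid_cross_entropy(x, z):
        # max(x, 0) - x * z + log(1 + exp(-abs(x)))
        return np.maximum(x, 0) - x * z + np.log1p(np.exp(-np.abs(x)))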
def add_position_encoding(input, alpha, beta, name=None):
    """
    **Add Position Encoding Layer**
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -195,22 +195,18 @@ class Optimizer(object):
                 format(name, param.name))
         return self._accumulators[name][param.name]

-    def _create_optimization_pass(self,
-                                  parameters_and_grads,
-                                  loss,
-                                  startup_program=None):
+    def _create_optimization_pass(self, parameters_and_grads):
         """Add optimization operators to update gradients to variables.

         Args:
-            loss(Variable): the target that this optimization is for.
             parameters_and_grads(list(tuple(Variable, Variable))):
                 a list of (variable, gradient) pair to update.

         Returns:
             return_op_list: a list of operators that will complete one step of
                 optimization. This will include parameter update ops, global step
                 update ops and any other custom ops required by subclasses to manage
                 their internal state.
         """
         # This is a default implementation of create_optimization_pass that
         # can be shared by most optimizers. This implementation assumes that
@@ -219,37 +215,33 @@ class Optimizer(object):
         # _create_accumulators method if it needs to create accumulators
         # for parameters and extend _finish_update method to add custom ops.

-        # Create any accumulators
-        program = loss.block.program
-        self._dtype = loss.dtype
-        with program_guard(program, startup_program):
-            global_block = framework.default_main_program().global_block()
-            start = len(global_block.ops)
-            self.helper = LayerHelper(self.__class__.__name__)
-            self._create_accumulators(loss.block,
-                                      [p[0] for p in parameters_and_grads])
-            self._create_global_learning_rate()
-
-            optimize_ops = []
-            for param_and_grad in parameters_and_grads:
-                if param_and_grad[1] is None:
-                    continue
-                with param_and_grad[0].block.program._optimized_guard(
-                        param_and_grad), name_scope("optimizer"):
-                    if param_and_grad[0].trainable is True:
-                        optimize_op = self._append_optimize_op(loss.block,
-                                                               param_and_grad)
-                        optimize_ops.append(optimize_op)
-
-            # Get custom finish ops for subclasses
-            # FIXME: Need to fix this once we figure out how to handle dependencies
-            self._finish_update(loss.block, parameters_and_grads)
-
-            end = len(global_block.ops)
-            return global_block._slice_ops(start, end)
-
-    def _process_distribute_lookuptable(self, param_grads, loss,
-                                        startup_program):
+        # Allways called under program_guard use global block as loss block
+        global_block = framework.default_main_program().global_block()
+        start = len(global_block.ops)
+        self.helper = LayerHelper(self.__class__.__name__)
+        self._create_accumulators(global_block,
+                                  [p[0] for p in parameters_and_grads])
+        self._create_global_learning_rate()
+
+        optimize_ops = []
+        for param_and_grad in parameters_and_grads:
+            if param_and_grad[1] is None:
+                continue
+            with param_and_grad[0].block.program._optimized_guard(
+                    param_and_grad), name_scope("optimizer"):
+                if param_and_grad[0].trainable is True:
+                    optimize_op = self._append_optimize_op(global_block,
+                                                           param_and_grad)
+                    optimize_ops.append(optimize_op)
+
+        # Get custom finish ops for subclasses
+        # FIXME: Need to fix this once we figure out how to handle dependencies
+        self._finish_update(global_block, parameters_and_grads)
+
+        end = len(global_block.ops)
+        return global_block._slice_ops(start, end)
+
+    def _process_distribute_lookuptable(self, param_grads):
         """
         Because distribute lookup table only support SGD optimizer for now, not support
         other optimizer and regularization, so we should find the table parameter out,
@@ -259,7 +251,8 @@ class Optimizer(object):
         :param loss: the loss variable.
         :param startup_program: the startup program
         """
-        program = loss.block.program
+        program = framework.default_main_program()
+        global_block = framework.default_main_program().global_block()
         table_name = find_distributed_lookup_table(program)
         table_param = None
         table_grad = None
@@ -275,38 +268,121 @@ class Optimizer(object):
                 new_param_grads.append((p, g))
         sgd_op = None
         if table_param is not None:
-            with program_guard(program, startup_program):
-                param_and_grad = [table_param, table_grad]
-                with table_param.block.program._optimized_guard(param_and_grad), \
-                        framework.name_scope("optimizer"):
-                    self._create_global_learning_rate()
-                    # create the optimize op
-                    sgd_op = loss.block.append_op(
-                        type='sgd',
-                        inputs={
-                            "Param": table_param,
-                            "Grad": table_grad,
-                            "LearningRate":
-                            self._create_param_lr(param_and_grad)
-                        },
-                        outputs={"ParamOut": param_and_grad[0]})
+            param_and_grad = [table_param, table_grad]
+            with table_param.block.program._optimized_guard(param_and_grad), \
+                    framework.name_scope("optimizer"):
+                self._create_global_learning_rate()
+                # create the optimize op
+                sgd_op = global_block.append_op(
+                    type='sgd',
+                    inputs={
+                        "Param": table_param,
+                        "Grad": table_grad,
+                        "LearningRate": self._create_param_lr(param_and_grad)
+                    },
+                    outputs={"ParamOut": param_and_grad[0]})
         return new_param_grads, (table_param, table_grad), sgd_op
+    def backward(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None,
+                 callbacks=None):
+        """
+        First part of `minimize`, do auto-diff to append backward ops for
+        the current program.
+
+        Args:
+            loss (Variable): loss variable to run optimizations.
+            startup_program (Program): startup_program for initializing parameters
+                in `parameter_list`.
+            parameter_list (list): list of Variables to update.
+            no_grad_set (set|None): set of Variables should be ignored.
+            callbacks (list|None): list of callables to run when appending backward
+                operator for one parameter.
+
+        Return:
+            list: list of (param, grad) pair, grad is the output of backward.
+
+        Examples:
+            See examples in `apply_gradients`.
+        """
+        if callbacks is None:
+            callbacks = [error_clip_callback]
+        else:
+            assert (isinstance(callbacks, list))
+            callbacks.append(error_clip_callback)
+        return append_backward(loss, parameter_list, no_grad_set, callbacks)
+
+    def apply_gradients(self, params_grads):
+        """
+        Second part of `minimize`, appending optimization operators for
+        given `params_grads` pairs.
+
+        Args:
+            params_grads (list): list of (param, grad) pair to do optimization.
+
+        Returns:
+            list: A list of operators appended to the current program.
+
+        Examples:
+            .. code-block:: python
+
+                loss = network()
+                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+                params_grads = optimizer.backward(loss)
+                # you may append operations for params_grads here
+                # ...
+                optimizer.apply_gradients(params_grads)
+        """
+        params_grads = sorted(params_grads, key=lambda x: x[0].name)
+        params_grads, table_param_and_grad, table_optimize_op = \
+            self._process_distribute_lookuptable(params_grads)
+
+        params_grads = append_gradient_clip_ops(params_grads)
+
+        # Add regularization if any
+        params_grads = append_regularization_ops(params_grads,
+                                                 self.regularization)
+
+        optimize_ops = self._create_optimization_pass(params_grads)
+        if table_optimize_op is not None:
+            optimize_ops.append(table_optimize_op)
+            params_grads.append(table_param_and_grad)
+        return optimize_ops
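A minimal sketch of the hook the docstring above describes: editing the (param, grad) list between `backward()` and `apply_gradients()`. `network()` and the gradient rescaling are illustrative assumptions, not part of the diff:

.. code-block:: python

    import paddle.fluid as fluid

    loss = network()  # assumed model-building function
    opt = fluid.optimizer.SGD(learning_rate=0.1)
    params_grads = opt.backward(loss)
    # custom processing of (param, grad) pairs, e.g. halve every gradient
    params_grads = [(p, g * 0.5) for p, g in params_grads]
    opt.apply_gradients(params_grads)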
     def minimize(self,
                  loss,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
-        """Add operations to minimize `loss` by updating `parameter_list`.
+        """
+        Add operations to minimize `loss` by updating `parameter_list`.

-        This method combines interface `append_backward()` and
-        `create_optimization_pass()` into one.
+        This method combines interface `backward()` and
+        `apply_gradients()` into one.
+
+        Args:
+            loss (Variable): loss variable to run optimizations.
+            startup_program (Program): startup_program for initializing parameters
+                in `parameter_list`.
+            parameter_list (list): list of Variables to update.
+            no_grad_set (set|None): set of Variables should be ignored.
+
+        Returns:
+            tuple: (optimize_ops, params_grads) which are, list of operators appended;
+            and list of (param, grad) Variables pair for optimization.
         """
+        self._dtype = loss.dtype
+        program = loss.block.program
+        optimize_ops = []
         if imperative_base.enabled():
             if parameter_list is not None:
                 params_grads = parameter_list
             else:
-                program = loss.block.program
                 parameters = program.global_block().all_parameters()
                 params_grads = []
                 for param in parameters:
@@ -314,32 +390,16 @@ class Optimizer(object):
                     grad_var = Variable(
                         block=loss.block,
                         name=param._ivar._grad_name(),
-                        stop_gradient=True)
-                    grad_var._value = param._ivar.grad_value
+                        stop_gradient=True,
+                        ivar=param._ivar._grad_ivar())
                     params_grads.append((param, grad_var))
-            with program_guard(program, startup_program):
-                optimize_ops = self._create_optimization_pass(params_grads,
-                                                              loss,
-                                                              startup_program)
+
+            optimize_ops = self._create_optimization_pass(params_grads)
         else:
-            params_grads = append_backward(loss, parameter_list, no_grad_set,
-                                           [error_clip_callback])
-
-            params_grads = sorted(params_grads, key=lambda x: x[0].name)
-
-            params_grads, table_param_and_grad, table_optimize_op = \
-                self._process_distribute_lookuptable(params_grads, loss, startup_program)
-
-            params_grads = append_gradient_clip_ops(params_grads)
-
-            # Add regularization if any
-            params_grads = append_regularization_ops(params_grads,
-                                                     self.regularization)
-
-            optimize_ops = self._create_optimization_pass(params_grads, loss,
-                                                          startup_program)
-            if table_optimize_op is not None:
-                optimize_ops.append(table_optimize_op)
-                params_grads.append(table_param_and_grad)
+            with program_guard(program, startup_program):
+                params_grads = self.backward(loss, startup_program,
                                              parameter_list, no_grad_set)
+                optimize_ops = self.apply_gradients(params_grads)

         return optimize_ops, params_grads
...
@@ -181,9 +181,8 @@ class ParallelExecutor(object):
         # step7: init ParallelExecutor
         self.executor = core.ParallelExecutor(
             places, persistable_vars, main.desc,
-            cpt.to_text(loss_name)
-            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
-            build_strategy, num_trainers, trainer_id)
+            cpt.to_text(loss_name) if loss_name else six.u(''), scope,
+            local_scopes, exec_strategy, build_strategy)

         self.scope = scope
@@ -294,7 +293,7 @@ class ParallelExecutor(object):
             res.append(res_dict)
         self.executor.feed_tensors_into_local_scopes(res)

-        fetch_var_name = '@FETCHED_VAR_NAME@'
+        fetch_var_name = 'fetch'
         self.executor.run(fetch_list, fetch_var_name)
         arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
...
@@ -30,6 +30,12 @@ class TestDataFeeder(unittest.TestCase):
         self.assertEqual(result['image'].recursive_sequence_lengths(), [])
         self.assertEqual(result['label'].recursive_sequence_lengths(), [])

+        try:
+            result = feeder.feed([([0] * 783, [9]), ([1] * 783, [1])])
+            self.assertTrue(False)
+        except ValueError:
+            self.assertTrue(True)
+
     def test_lod_level_1_converter(self):
         # lod_level = 1
         # each sentence has a different number of words
...
@@ -31,6 +31,7 @@ fluid.default_main_program().random_seed = 1

 class TestDistCTR2x2(TestDistRunnerBase):
     def get_model(self, batch_size=2):
         dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
         """ network definition """
         dnn_data = fluid.layers.data(
@@ -97,7 +98,14 @@ class TestDistCTR2x2(TestDistRunnerBase):
         inference_program = paddle.fluid.default_main_program().clone()

-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+        regularization = None
+        use_l2_decay = bool(os.getenv('USE_L2_DECAY', 0))
+        if use_l2_decay:
+            regularization = fluid.regularizer.L2DecayRegularizer(
+                regularization_coeff=1e-1)
+
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001,
+                                            regularization=regularization)
         sgd_optimizer.minimize(avg_cost)

         dataset = dist_ctr_reader.Dataset()
...
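A short sketch of the pattern the test toggles above: attaching L2 weight decay at the optimizer level. Roughly, the regularizer contributes a `coeff * param` term to each gradient when `apply_gradients()` calls `append_regularization_ops` (stated here as an assumption; `avg_cost` is the model's loss):

.. code-block:: python

    import paddle.fluid as fluid

    reg = fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-1)
    sgd = fluid.optimizer.SGD(learning_rate=0.0001, regularization=reg)
    sgd.minimize(avg_cost)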
@@ -235,7 +235,6 @@ class DistSeResneXt2x2(TestDistRunnerBase):
         bd = [step * e for e in epochs]

         base_lr = 0.1
-        lr = []
         lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]

         optimizer = fluid.optimizer.Momentum(
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp
class TestNGRAPHMeanOp(TestMeanOp):
def setUp(self):
super(TestNGRAPHMeanOp, self).setUp()
class TestNGRAPHFP16MeanOp(TestFP16MeanOp):
def setUp(self):
super(TestNGRAPHFP16MeanOp, self).setUp()
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows
class TestNGRAPHScaleOp(TestScaleOp):
def init_dtype_type(self):
pass
class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
def init_dtype_type(self):
pass
class TestNGRAPHScaleFp16Op(TestScaleFp16Op):
def init_dtype_type(self):
pass
class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows):
def init_dtype_type(self):
pass
if __name__ == "__main__":
unittest.main()
@@ -19,6 +19,7 @@ import os
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid import compiler
 import time
 import numpy as np
 import math
@@ -44,15 +45,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                            optimizer=fluid.optimizer.Adam,
                            use_fast_executor=False,
                            enable_sequential_execution=False):
-        def run_executor(exe, feed, fetch_list, program=None):
-            if isinstance(exe, fluid.ParallelExecutor):
-                res = exe.run(fetch_list=fetch_list, feed=feed)
-            elif isinstance(exe, fluid.Executor):
-                if program is None:
-                    program = fluid.default_main_program()
-                res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
-            else:
-                raise ValueError('Unkown type exe')
+        def run_executor(exe, binary, feed, fetch_list):
+            res = exe.run(binary, feed=feed, fetch_list=fetch_list)
             return res

         main = fluid.Program()
@@ -72,8 +66,8 @@ class TestParallelExecutorBase(unittest.TestCase):
             fluid.memory_optimize(main)

             place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            startup_exe = fluid.Executor(place)
-            startup_exe.run(startup)
+            exe = fluid.Executor(place)
+            exe.run(startup)
             exec_strategy = fluid.ExecutionStrategy()
             exec_strategy.allow_op_delay = allow_op_delay
             if use_fast_executor:
@@ -86,15 +80,13 @@ class TestParallelExecutorBase(unittest.TestCase):
             build_strategy.enable_sequential_execution = enable_sequential_execution
             if use_cuda and core.is_compiled_with_cuda():
                 build_strategy.remove_unnecessary_lock = True

             if use_parallel_executor:
-                exe = fluid.ParallelExecutor(
-                    use_cuda,
+                binary = compiler.CompiledProgram(main).with_data_parallel(
                     loss_name=loss.name,
-                    exec_strategy=exec_strategy,
-                    build_strategy=build_strategy)
+                    build_strategy=build_strategy,
+                    exec_strategy=exec_strategy)
             else:
-                exe = fluid.Executor(place=place)
+                binary = compiler.CompiledProgram(main)

             if batch_size is not None:
                 batch_size *= fluid.core.get_cuda_device_count(
@@ -102,13 +94,14 @@ class TestParallelExecutorBase(unittest.TestCase):
                     os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
             begin = time.time()
             first_loss, = run_executor(
-                exe=exe, feed=feed_dict, fetch_list=[loss.name])
+                exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])

             for i in range(iter):
-                run_executor(exe=exe, feed=feed_dict, fetch_list=[])
+                run_executor(
+                    exe=exe, binary=binary, feed=feed_dict, fetch_list=[])

             last_loss, = run_executor(
-                exe=exe, feed=feed_dict, fetch_list=[loss.name])
+                exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
             end = time.time()

             if batch_size is not None:
...
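The same migration in isolation, as a hedged sketch; `place`, `startup`, `main`, `loss` and `feed_dict` are assumed to come from the surrounding setup:

.. code-block:: python

    import paddle.fluid as fluid
    from paddle.fluid import compiler

    exe = fluid.Executor(place)
    exe.run(startup)
    # one plain Executor now runs either a single-device or a
    # data-parallel compiled program
    binary = compiler.CompiledProgram(main).with_data_parallel(
        loss_name=loss.name)
    loss_val, = exe.run(binary, feed=feed_dict, fetch_list=[loss.name])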
@@ -25,6 +25,15 @@ from test_conv2d_op import conv2d_forward_naive, TestConv2dOp
 def conv2d_forward_refer(input, filter, group, conv_param):
     out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group,
                                                           conv_param)
+    size = [in_n, out_c, out_h, out_w]
+    return format_reorder(out, size)
+
+
+def format_reorder(out, size):
+    in_n = size[0]
+    out_h = size[2]
+    out_w = size[3]
+    out_c = size[1]
     out_tmp = np.zeros((in_n, out_h, out_w, out_c))
     for n in range(in_n):
         for i in range(out_h):
@@ -48,6 +57,7 @@ class TestConv2dInt8Op(TestConv2dOp):
         self.init_dilation()
         self.init_test_case()
         self.init_fuse_relu()
+        self.init_fuse_residual()
         self.init_data_type()

         conv2d_param = {
@@ -79,11 +89,24 @@ class TestConv2dInt8Op(TestConv2dOp):
                 np.round((input_shift) * self.scale_in).astype(np.int32),
                 filter_int, self.groups,
                 conv2d_param).astype(np.float32) * scale_output_shift
-            if self.fuse_relu:
-                output = np.maximum(np.round(output1 - output2),
-                                    0).astype(self.dsttype)
+            if self.fuse_residual:
+                input_residual = np.random.randint(
+                    -5, 5, self.input_residual_size).astype(self.srctype)
+                output_tmp = np.round(output1 - output2 + format_reorder(
+                    input_residual, self.input_residual_size).astype(
+                        self.srctype) * (self.scale_out / self.scale_in_eltwise
+                                         ))
+                if self.fuse_relu:
+                    output = np.maximum(output_tmp, 0).astype(self.dsttype)
+                else:
+                    output = output_tmp.astype(self.dsttype)
             else:
-                output = np.round(output1 - output2).astype(self.dsttype)
+                if self.fuse_relu:
+                    output = np.maximum(np.round(output1 - output2),
+                                        0).astype(self.dsttype)
+                else:
+                    output = np.round(output1 - output2).astype(self.dsttype)
         else:
             filter_int = np.round(filter *
                                   self.scale_weights[0]).astype(np.int32)
@@ -92,21 +115,35 @@ class TestConv2dInt8Op(TestConv2dOp):
             output1 = conv2d_forward_refer(
                 input.astype(np.int32), filter_int, self.groups,
                 conv2d_param).astype(np.float32)
-            if self.fuse_relu:
-                output = np.maximum(
-                    np.round(output1 * (self.scale_out / (
-                        self.scale_in * self.scale_weights[0]))),
-                    0).astype(self.dsttype)
+            if self.fuse_residual:
+                input_residual = np.random.randint(
+                    0, 10, self.input_residual_size).astype(self.srctype)
+                output_tmp = np.round(output1 * (self.scale_out / (
+                    self.scale_in * self.scale_weights[0])) + format_reorder(
+                        input_residual, self.input_residual_size).astype(
+                            np.int32) * (self.scale_out / self.scale_in_eltwise
+                                         ))
+                output_tmp2 = np.round(output1 * (
+                    self.scale_out / (self.scale_in * self.scale_weights[0])))
+                if self.fuse_relu:
+                    output = np.maximum(output_tmp, 0).astype(self.dsttype)
+                else:
+                    output = output_tmp.astype(self.dsttype)
             else:
-                output = np.round(output1 * (self.scale_out / (
-                    self.scale_in *
-                    self.scale_weights[0]))).astype(self.dsttype)
+                if self.fuse_relu:
+                    output = np.maximum(output_tmp2, 0).astype(self.dsttype)
+                else:
+                    output = output_tmp2.astype(self.dsttype)

         self.inputs = {
             'Input':
             OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)),
             'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
         }
+        if self.fuse_residual:
+            self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
+                input_residual)
+
         self.attrs = {
             'strides': self.stride,
             'paddings': self.pad,
@@ -119,7 +156,9 @@ class TestConv2dInt8Op(TestConv2dOp):
             'Scale_in': self.scale_in,
             'Scale_out': self.scale_out,
             'Scale_weights': self.scale_weights,
-            'fuse_relu': self.fuse_relu
+            'Scale_in_eltwise': self.scale_in_eltwise,
+            'fuse_relu': self.fuse_relu,
+            'fuse_residual_connection': self.fuse_residual
         }
         self.outputs = {'Output': output}
@@ -137,11 +176,14 @@ class TestConv2dInt8Op(TestConv2dOp):
     def init_test_case(self):
         TestConv2dOp.init_test_case(self)

         self.input_size = [1, 1, 5, 5]  # NCHW
         f_c = self.input_size[1] // self.groups
-        self.filter_size = [1, f_c, 3, 3]
+        self.input_residual_size = [1, 2, 3, 3]
+        self.filter_size = [2, f_c, 3, 3]
         self.scale_in = 1.0
         self.scale_out = 0.5
         self.scale_weights = [10.0]
+        self.scale_in_eltwise = 0.6

     def init_data_type(self):
         self.srctype = np.uint8
@@ -150,8 +192,11 @@ class TestConv2dInt8Op(TestConv2dOp):
     def init_fuse_relu(self):
         self.fuse_relu = True

+    def init_fuse_residual(self):
+        self.fuse_residual = True

-#--------------------test conv2d u8 in and u8 out--------------------
+
+#--------------------test conv2d u8 in and u8 out with residual fuse--------------------


 class TestConv2d(TestConv2dInt8Op):
@@ -159,18 +204,21 @@ class TestConv2d(TestConv2dInt8Op):
         self.pad = [0, 0]
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
+        self.input_residual_size = [2, 6, 3, 3]
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
         self.scale_in = 1.0
         self.scale_out = 0.5
         self.scale_weights = [10.0]
+        self.scale_in_eltwise = 0.6


 class TestWithPad(TestConv2d):
     def init_test_case(self):
         TestConv2d.init_test_case(self)
         self.pad = [1, 1]
+        self.input_residual_size = [2, 6, 5, 5]


 class TestWithGroup(TestConv2d):
@@ -183,12 +231,14 @@ class TestWithStride(TestConv2dInt8Op):
         self.pad = [1, 1]
         self.stride = [2, 2]
         self.input_size = [2, 3, 6, 6]
+        self.input_residual_size = [2, 6, 3, 3]
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
         self.scale_in = 1.0
         self.scale_out = 0.8
         self.scale_weights = [10.0]
+        self.scale_in_eltwise = 0.5


 class TestWith1x1(TestConv2dInt8Op):
@@ -196,12 +246,14 @@ class TestWith1x1(TestConv2dInt8Op):
         self.pad = [0, 0]
         self.stride = [1, 1]
         self.input_size = [1, 3, 5, 5]
+        self.input_residual_size = [1, 6, 5, 5]
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
         self.scale_in = 1.0
         self.scale_out = 0.5
         self.scale_weights = [12.0]
+        self.scale_in_eltwise = 0.5


 class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
@@ -209,24 +261,29 @@ class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
         self.pad = [0, 0]
         self.stride = [1, 1]
         self.input_size = [2, 3, 1, 1]
+        self.input_residual_size = [2, 6, 1, 1]
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
         self.scale_in = 1.0
         self.scale_out = 0.5
         self.scale_weights = [10.0]
+        self.scale_in_eltwise = 0.8

     def init_group(self):
         self.groups = 3


-def init_data_type_with_fusion(self, input_dt, fuse_relu):
+def init_data_type_with_fusion(self, input_dt, fuse_relu, fuse_residual):
     self.srctype = input_dt
     self.dsttype = np.uint8 if fuse_relu else np.int8

     def init_fuse_relu(self):
         self.fuse_relu = fuse_relu

+    def init_fuse_residual(self):
+        self.fuse_residual = fuse_residual


 def create_test_int8_class(parent):
@@ -234,29 +291,68 @@ def create_test_int8_class(parent):
     class TestS8U8Case(parent):
         def init_data_type(self):
-            init_data_type_with_fusion(self, np.int8, True)
+            init_data_type_with_fusion(self, np.int8, True, False)

     #--------------------test conv2d s8 in and s8 out--------------------

     class TestS8S8Case(parent):
         def init_data_type(self):
-            init_data_type_with_fusion(self, np.int8, False)
+            init_data_type_with_fusion(self, np.int8, False, False)

     #--------------------test conv2d u8 in and s8 out--------------------

     class TestU8S8Case(parent):
         def init_data_type(self):
-            init_data_type_with_fusion(self, np.uint8, False)
+            init_data_type_with_fusion(self, np.uint8, False, False)

+    #--------------------test conv2d u8 in and u8 out without residual fuse--------------------
+
+    class TestU8U8Case(parent):
+        def init_data_type(self):
+            init_data_type_with_fusion(self, np.uint8, True, False)
+
+    #--------------------test conv2d s8 in and u8 out with residual fuse--------------------
+
+    class TestS8U8ResCase(parent):
+        def init_data_type(self):
+            init_data_type_with_fusion(self, np.int8, True, True)
+
+    #--------------------test conv2d s8 in and s8 out with residual fuse--------------------
+
+    class TestS8S8ResCase(parent):
+        def init_data_type(self):
+            init_data_type_with_fusion(self, np.int8, False, True)
+
+    #--------------------test conv2d u8 in and s8 out with residual fuse--------------------
+
+    class TestU8S8ResCase(parent):
+        def init_data_type(self):
+            init_data_type_with_fusion(self, np.uint8, False, True)

-    cls_name_s8u8 = "{0}_relu_{1}".format(parent.__name__, "1")
-    cls_name_s8s8 = "{0}_relu_{1}".format(parent.__name__, "0")
-    cls_name_u8s8 = "{0}_relu_{1}".format(parent.__name__, "0")
+    cls_name_s8u8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "1")
+    cls_name_s8s8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "0")
+    cls_name_u8s8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "0")
+    cls_name_u8u8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "1")
+    cls_name_s8u8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
+                                                            "1", "1")
+    cls_name_s8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
+                                                            "0", "1")
+    cls_name_u8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
+                                                            "0", "1")
     TestS8U8Case.__name__ = cls_name_s8u8
     TestS8S8Case.__name__ = cls_name_s8s8
     TestU8S8Case.__name__ = cls_name_u8s8
+    TestU8U8Case.__name__ = cls_name_u8u8
+    TestS8U8ResCase.__name__ = cls_name_s8u8_re_1
+    TestS8S8ResCase.__name__ = cls_name_s8s8_re_1
+    TestU8S8ResCase.__name__ = cls_name_u8s8_re_1
     globals()[cls_name_s8u8] = TestS8U8Case
     globals()[cls_name_s8s8] = TestS8S8Case
     globals()[cls_name_u8s8] = TestU8S8Case
+    globals()[cls_name_u8u8] = TestU8U8Case
+    globals()[cls_name_s8u8_re_1] = TestS8U8ResCase
+    globals()[cls_name_s8s8_re_1] = TestS8S8ResCase
+    globals()[cls_name_u8s8_re_1] = TestU8S8ResCase


 create_test_int8_class(TestConv2dInt8Op)
...
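The fused residual epilogue these cases exercise, restated as a rough numpy sketch (argument names are illustrative; the rounding order mirrors the reference computation above):

.. code-block:: python

    import numpy as np

    def residual_epilogue(conv_f32, residual, scale_out, scale_in_eltwise,
                          fuse_relu, dst_dtype):
        # add the re-scaled residual input, then round and optionally relu
        out = np.round(conv_f32 + residual.astype(np.float32) *
                       (scale_out / scale_in_eltwise))
        if fuse_relu:
            out = np.maximum(out, 0)
        return out.astype(dst_dtype)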
@@ -26,6 +26,7 @@ import pickle
 import numpy as np

 import paddle.fluid as fluid
+from paddle.fluid import compiler

 RUN_STEP = 10
 DEFAULT_BATCH_SIZE = 2
@@ -104,8 +105,8 @@ class TestDistRunnerBase(object):
         else:
             place = fluid.CPUPlace()

-        startup_exe = fluid.Executor(place)
-        startup_exe.run(fluid.default_startup_program())
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())

         strategy = fluid.ExecutionStrategy()
         strategy.num_threads = 1
@@ -125,19 +126,16 @@ class TestDistRunnerBase(object):
                 mypass.set_int("num_repeats", args.batch_merge_repeat)

         if args.update_method == "nccl2":
-            num_trainers = len(args.endpoints.split(","))
-            trainer_id = args.trainer_id
+            build_stra.num_trainers = len(args.endpoints.split(","))
+            build_stra.trainer_id = args.trainer_id
         else:
-            num_trainers = 1
-            trainer_id = 0
+            build_stra.num_trainers = 1
+            build_stra.trainer_id = 0

-        exe = fluid.ParallelExecutor(
-            args.use_cuda,
+        binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
             loss_name=avg_cost.name,
-            exec_strategy=strategy,
             build_strategy=build_stra,
-            num_trainers=num_trainers,
-            trainer_id=trainer_id)
+            exec_strategy=strategy)

         feed_var_list = [
             var for var in trainer_prog.global_block().vars.values()
@@ -160,7 +158,8 @@ class TestDistRunnerBase(object):
         out_losses = []
         for _ in six.moves.xrange(RUN_STEP):
-            loss, = exe.run(fetch_list=[avg_cost.name],
+            loss, = exe.run(binary,
+                            fetch_list=[avg_cost.name],
                             feed=feeder.feed(get_data()))
             out_losses.append(loss[0])
         if six.PY2:
...
@@ -18,7 +18,6 @@ import unittest
 from test_dist_base import TestDistBase


-# FIXME(tangwei): sum op can not handle when inputs is empty.
 class TestDistCTR2x2(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
@@ -28,5 +27,19 @@ class TestDistCTR2x2(TestDistBase):
         self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)


+class TestDistCTRWithL2Decay2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+
+    def test_dist_ctr(self):
+        need_envs = {"USE_L2_DECAY": "1"}
+        self.check_with_place(
+            "dist_ctr.py",
+            delta=1e-7,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
 if __name__ == "__main__":
     unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
from test_reorder_lod_tensor import convert_to_offset
from test_seq_pool import compute_seqpool_sum, compute_seqpool_avg, compute_seqpool_sqrt
class TestFusionSeqPoolConcatOp(OpTest):
def setUp(self):
self.w = 11
self.lods = [[[2, 3, 5]], [[1, 5, 2]]]
self.set_conf()
self.set_pooltype()
self.op_type = 'fusion_seqpool_concat'
self.axis = 1
bs = len(self.lods[0][0])
inputs = []
outs = []
i = 0
for lod in self.lods:
assert bs == len(lod[0]), 'All lod size should be equal'
x = np.random.uniform(0.1, 1,
[sum(lod[0]), self.w]).astype('float32')
offset = convert_to_offset(lod)
out = np.zeros((bs, self.w)).astype('float32')
if self.pooltype == "SUM":
compute_seqpool_sum(x, offset, out)
elif self.pooltype == "AVERAGE":
compute_seqpool_avg(x, offset, out)
elif self.pooltype == "SQRT":
compute_seqpool_sqrt(x, offset, out)
else:
raise Exception("Unsupported pool type!")
inputs.append(('x_{0}'.format(i), (x, lod)))
outs.append(out)
i = i + 1
self.inputs = {'X': inputs}
self.outputs = {'Out': np.concatenate(outs, axis=self.axis)}
self.attrs = {
'pooltype': self.pooltype,
'axis': self.axis,
}
def set_pooltype(self):
self.pooltype = "SUM"
def set_conf(self):
pass
def test_check_output(self):
self.check_output()
class TestFusionSeqPoolConcatOpCase1(TestFusionSeqPoolConcatOp):
def set_conf(self):
self.lods = [[[1]]]
class TestFusionSeqPoolConcatOpCase2(TestFusionSeqPoolConcatOp):
def set_conf(self):
self.lods = [[[1]], [[1]], [[1]]]
class TestFusionSeqPoolConcatOpCase3(TestFusionSeqPoolConcatOp):
def set_conf(self):
self.lods = [[[1, 3, 4, 6]]]
self.w = 10
class TestFusionSeqPoolConcatOpCase4(TestFusionSeqPoolConcatOp):
def set_conf(self):
self.lods = [[[2, 13, 4]], [[1, 1, 1]], [[5, 3, 1]], [[9, 10, 3]]]
self.w = 3
## test avg pool and sqrt
def create_test_avg_sqrt_class(parent):
class TestSeqPoolAvgCase(parent):
def set_pooltype(self):
self.pooltype = "AVERAGE"
class TestSeqPoolSqrtCase(parent):
def set_pooltype(self):
self.pooltype = "SQRT"
cls_name_avg = "{0}_{1}".format(parent.__name__, "avg")
cls_name_sqrt = "{0}_{1}".format(parent.__name__, "sqrt")
TestSeqPoolAvgCase.__name__ = cls_name_avg
TestSeqPoolSqrtCase.__name__ = cls_name_sqrt
globals()[cls_name_avg] = TestSeqPoolAvgCase
globals()[cls_name_sqrt] = TestSeqPoolSqrtCase
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOp)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase1)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase2)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase3)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase4)
if __name__ == '__main__':
unittest.main()
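What the fused op computes for the SUM case, restated as a small numpy sketch (the helper name is illustrative; `lods` here are plain segment-length lists like the inner `lod[0]` above):

.. code-block:: python

    import numpy as np

    def seqpool_concat_sum(xs, lods):
        # pool each input over its lod segments, then concat along axis 1
        outs = []
        for x, lod in zip(xs, lods):
            offset = np.cumsum([0] + lod)
            outs.append(
                np.stack([x[offset[i]:offset[i + 1]].sum(axis=0)
                          for i in range(len(lod))]))
        return np.concatenate(outs, axis=1)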
@@ -15,6 +15,7 @@
 import contextlib
 import unittest
 import numpy as np
+import sys

 import paddle.fluid as fluid
 from paddle.fluid import core
@@ -22,7 +23,7 @@ from paddle.fluid.imperative.nn import FC
 from test_imperative_base import new_program_scope


-class MyLayer(fluid.imperative.PyLayer):
+class MyLayer(fluid.imperative.Layer):
     def __init__(self):
         super(MyLayer, self).__init__()
@@ -34,7 +35,35 @@ class MyLayer(fluid.imperative.PyLayer):
         return [x]


-class MLP(fluid.imperative.PyLayer):
+class MyPyLayer(fluid.imperative.PyLayer):
+    def __init__(self):
+        super(MyPyLayer, self).__init__()
+
+    @staticmethod
+    def forward(inputs):
+        sys.stderr.write('before forward\n')
+        ret = np.tanh(inputs[0])
+        sys.stderr.write('after forward: %s\n' % ret)
+        tensor = core.LoDTensor()
+        tensor.set(ret, core.CPUPlace())
+        return tuple([tensor])
+
+    @staticmethod
+    def backward(inputs):
+        sys.stderr.write('calling into backward: %s\n' % str(inputs))
+        inp, out, dout = inputs
+        inp = np.array(inp)
+        out = np.array(out)
+        dout = np.array(dout)
+        sys.stderr.write('calling into backward: %s, %s, %s\n' %
+                         (inp, out, dout))
+        ret = np.array(dout) * (1 - np.square(np.array(out)))
+        tensor = core.LoDTensor()
+        tensor.set(ret, core.CPUPlace())
+        return tuple([tensor])
+
+
+class MLP(fluid.imperative.Layer):
     def __init__(self):
         super(MLP, self).__init__()
         self._fc1 = FC(3,
@@ -56,9 +85,77 @@ class TestImperative(unittest.TestCase):
         with fluid.imperative.guard():
             cl = core.Layer()
             cl.forward([])
-            l = fluid.imperative.PyLayer()
+            l = fluid.imperative.Layer()
             self.assertRaises(NotImplementedError, l.forward, [])

+    def test_pylayer_func_id(self):
+
+        with fluid.imperative.guard():
+
+            class PyLayer1(fluid.imperative.PyLayer):
+                def __init__(self):
+                    super(PyLayer1, self).__init__()
+
+                @staticmethod
+                def forward(input):
+                    return input
+
+                @staticmethod
+                def backward(input):
+                    return input
+
+            class PyLayer2(fluid.imperative.PyLayer):
+                def __init__(self):
+                    super(PyLayer2, self).__init__()
+
+                @staticmethod
+                def forward(input):
+                    return input
+
+                @staticmethod
+                def backward(input):
+                    return input
+
+            py_layer_1 = PyLayer1()
+            py_layer_2 = PyLayer2()
+            py_layer_1(fluid.imperative.base.to_variable(np.ones([2, 2])))
+            py_layer_2(fluid.imperative.base.to_variable(np.ones([2, 2])))
+            id = py_layer_1.forward_id
+            self.assertGreater(id, 0)
+            self.assertEqual(py_layer_1.backward_id, id + 1)
+            self.assertEqual(py_layer_2.forward_id, id + 2)
+            self.assertEqual(py_layer_2.backward_id, id + 3)
+            py_layer_1(fluid.imperative.base.to_variable(np.ones([2, 2])))
+            self.assertEqual(py_layer_1.forward_id, id)
+
+    def test_pylayer(self):
+        np_inp = np.ones([2, 2], np.float32)
+        with fluid.imperative.guard():
+            my_py_layer = MyPyLayer()
+            var_inp = fluid.imperative.base.to_variable(np_inp)
+            outs = my_py_layer(var_inp)
+            dy_out = np.sum(outs[0]._numpy())
+            outs[0]._backward()
+            dy_grad = var_inp._gradient()
+
+        with new_program_scope():
+            inp = fluid.layers.data(
+                name="inp", shape=[2, 2], append_batch_size=False)
+            # TODO(panyx0718): Paddle doesn't diff against data `inp`.
+            x1 = inp * 1
+            # TODO(panyx0718): If reduce_sum is skipped, the result is wrong.
+            x = fluid.layers.reduce_sum(fluid.layers.tanh(x1))
+            param_grads = fluid.backward.append_backward(
+                x, parameter_list=[x1.name])[0]
+            exe = fluid.Executor(fluid.CPUPlace())
+
+            static_out, static_grad = exe.run(
+                feed={inp.name: np_inp},
+                fetch_list=[x.name, param_grads[1].name])
+
+        self.assertTrue(np.allclose(dy_out, static_out))
+        self.assertTrue(np.allclose(dy_grad, static_grad))
+
     def test_layer_in_out(self):
         np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
         with fluid.imperative.guard():
...
@@ -26,7 +26,7 @@ from paddle.fluid.imperative.base import to_variable
 from test_imperative_base import new_program_scope


-class SimpleImgConvPool(fluid.imperative.PyLayer):
+class SimpleImgConvPool(fluid.imperative.Layer):
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -72,7 +72,7 @@ class SimpleImgConvPool(fluid.imperative.PyLayer):
         return x


-class MNIST(fluid.imperative.PyLayer):
+class MNIST(fluid.imperative.Layer):
     def __init__(self, param_attr=None, bias_attr=None):
         super(MNIST, self).__init__()
@@ -105,7 +105,6 @@ class TestImperativeMnist(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed

-            # mnist = Conv2D(1, 20, 5)
             mnist = MNIST()
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
@@ -126,16 +125,17 @@ class TestImperativeMnist(unittest.TestCase):
                 label._stop_gradient = True

                 cost = mnist(img)
-                loss = fluid.layers.reduce_mean(cost)
-                dy_out = loss._numpy()
+                loss = fluid.layers.cross_entropy(cost, label)
+                avg_loss = fluid.layers.mean(loss)
+                dy_out = avg_loss._numpy()

                 if batch_id == 0:
                     for param in fluid.default_main_program().global_block(
                     ).all_parameters():
                         dy_param_init_value[param.name] = param._numpy()

-                loss._backward()
-                sgd.minimize(loss)
+                avg_loss._backward()
+                sgd.minimize(avg_loss)
                 dy_param_value = {}
                 for param in fluid.default_main_program().global_block(
                 ).all_parameters():
@@ -147,7 +147,6 @@ class TestImperativeMnist(unittest.TestCase):

             exe = fluid.Executor(fluid.CPUPlace())

-            # mnist = Conv2D(1, 20, 5)
             mnist = MNIST()
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
@@ -157,8 +156,9 @@ class TestImperativeMnist(unittest.TestCase):
                 name='pixel', shape=[1, 28, 28], dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             cost = mnist(img)
-            loss = fluid.layers.reduce_mean(cost)
-            sgd.minimize(loss)
+            loss = fluid.layers.cross_entropy(cost, label)
+            avg_loss = fluid.layers.mean(loss)
+            sgd.minimize(avg_loss)

             # initialize params and fetch them
             static_param_init_value = {}
@@ -182,7 +182,7 @@ class TestImperativeMnist(unittest.TestCase):
                 y_data = np.array([x[1] for x in data]).astype('int64').reshape(
                     [128, 1])

-                fetch_list = [loss.name]
+                fetch_list = [avg_loss.name]
                 fetch_list.extend(static_param_name_list)
                 out = exe.run(fluid.default_main_program(),
                               feed={"pixel": x_data,
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import six
from paddle import fluid


class TestIRGraph(unittest.TestCase):
    """
    TODO(fc500110): `resolve_hazard` api will be tested when it can be used.
    """

    def test_nodes(self):
        graph = build_graph()
        self.assertTrue(
            {node.name()
             for node in graph.nodes()} == {"x1", "x2", "out", "sum"})

    def test_has_set_get(self):
        graph = build_graph()
        for attr_name in ["int", "float", "string"]:
            self.assertFalse(graph.has(attr_name))
        graph.set("int", 1)
        graph.set("float", 0.5)
        graph.set("string", "string")
        for attr_name in ["int", "float", "string"]:
            self.assertTrue(graph.has(attr_name))
        self.assertTrue(graph.get_int("int") == 1)
        self.assertTrue(graph.get_float("float") == 0.5)
        self.assertTrue(graph.get_string("string") == "string")

    def test_erase(self):
        graph = build_graph()
        graph.set("test", 0)
        self.assertTrue(graph.has("test"))
        graph.erase("test")
        self.assertFalse(graph.has("test"))

    def test_create_var_node(self):
        prog = fluid.core.ProgramDesc()
        block = prog.block(0)
        shape = [10, 20]
        x1 = block.var(six.b("x1"))
        x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
        x1.set_shape(shape)
        graph = fluid.core.Graph(prog)
        node = graph.create_var_node(x1)
        self.assertTrue(node.node_type() == fluid.core.Node.Type.Variable)

    def test_create_op_node(self):
        prog = fluid.core.ProgramDesc()
        block = prog.block(0)
        sum_op_desc = block.append_op()
        graph = fluid.core.Graph(prog)
        node = graph.create_op_node(sum_op_desc)
        self.assertTrue(node.node_type() == fluid.core.Node.Type.Operation)

    def test_create_control_dep_var(self):
        graph = build_graph()
        name = "__control_var@{}".format(len(graph.nodes()))
        node = graph.create_control_dep_var()
        self.assertTrue(node.name() == name)

    def test_create_empty_node(self):
        prog = fluid.core.ProgramDesc()
        graph = fluid.core.Graph(prog)
        n1 = graph.create_empty_node('x', fluid.core.Node.Type.Operation)
        self.assertTrue(n1.name() == 'x')
        n2 = graph.create_empty_node('y', fluid.core.Node.Type.Variable)
        self.assertTrue(n2.name() == 'y')

    def test_release_nodes(self):
        graph = build_graph()
        nodes = graph.release_nodes()
        self.assertTrue(len(graph.nodes()) == 0)
        self.assertTrue({node.name()
                         for node in nodes} == {"x1", "x2", "out", "sum"})

    def test_remove_node(self):
        graph = build_graph()
        nodes = graph.nodes()
        for node in nodes:
            if node.name() == "sum":
                break
        self.assertTrue({node.name()
                         for node in nodes} == {"x1", "x2", "out", "sum"})
        nodes.remove(node)
        self.assertTrue({node.name() for node in nodes} == {"x1", "x2", "out"})

    def test_retrieve_node(self):
        graph = build_graph()
        nodes = []
        for i in range(len(graph.nodes())):
            nodes.append(graph.retrieve_node(i))
        for node in nodes:
            self.assertTrue(node in graph.nodes())

    def resolve_hazard(self):
        pass


def build_graph():
    prog = fluid.core.ProgramDesc()
    block = prog.block(0)
    shape = [10, 20]

    # prepare input/output
    x1 = block.var(six.b("x1"))
    x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
    x1.set_shape(shape)
    x2 = block.var(six.b("x2"))
    x2.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
    x2.set_shape(shape)

    out = block.var(six.b("out"))
    out.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)

    sum_op_desc = block.append_op()
    sum_op_desc.set_type("sum")
    sum_op_desc.set_input("X", ["x1", "x2"])
    sum_op_desc.set_output("Out", ["out"])

    sum_op_desc.check_attrs()
    sum_op_desc.infer_shape(block)
    graph = fluid.core.Graph(prog)
    return graph


if __name__ == "__main__":
    unittest.main()
@@ -61,6 +61,48 @@ class TestOptimizer(unittest.TestCase):
         self.assertEqual([op.type for op in opts], ["sgd"])

+class TestOptimizerBackwardApplygrad(unittest.TestCase):
+    def test_sgd_optimizer(self):
+        def check_sgd_optimizer(optimizer_attr):
+            init_program = framework.Program()
+            program = framework.Program()
+            block = program.global_block()
+            mul_x = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="mul.x",
+                optimize_attr=optimizer_attr)
+            mul_y = block.create_var(
+                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+            mul_out = block.create_var(
+                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+            mean_out = block.create_var(
+                dtype="float32", shape=[1], lod_level=0, name="mean.out")
+            block.append_op(
+                type="mul",
+                inputs={"X": mul_x,
+                        "Y": mul_y},
+                outputs={"Out": mul_out},
+                attrs={"x_num_col_dims": 1})
+            block.append_op(
+                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+            sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
+            with framework.program_guard(program, init_program):
+                p_g = sgd_optimizer.backward(mean_out)
+                opts = sgd_optimizer.apply_gradients(p_g)
+            return opts
+
+        opts = check_sgd_optimizer({'learning_rate': 1.1})
+        self.assertEqual(len(opts), 3)
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "sgd"])
+
+        opts = check_sgd_optimizer({'learning_rate': 1.0})
+        self.assertEqual(len(opts), 1)
+        self.assertEqual([op.type for op in opts], ["sgd"])
+

 class TestMomentumOptimizer(unittest.TestCase):
     class MockMomentum(optimizer.MomentumOptimizer):
         def get_accumulators(self):
@@ -99,8 +141,8 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = momentum_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
         self.assertEqual([op.type for op in opts],
@@ -153,8 +195,8 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = momentum_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
         self.assertEqual([op.type for op in opts],
@@ -216,8 +258,8 @@ class TestAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-        opts = adagrad_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = adagrad_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         self.assertEqual([op.type for op in opts],
                          ["fill_constant", "elementwise_mul", "adagrad"])
@@ -280,8 +322,8 @@ class TestAdamOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-        opts = adam_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                        init_program)
+        with framework.program_guard(program, init_program):
+            opts = adam_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 5)
         self.assertEqual(
             [op.type for op in opts],
@@ -347,8 +389,8 @@ class TestAdamaxOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-        opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                          init_program)
+        with framework.program_guard(program, init_program):
+            opts = adamax_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 4)
         self.assertEqual(
             [op.type for op in opts],
@@ -411,8 +453,8 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
-        opts = decayed_adagrad_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = decayed_adagrad_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         self.assertEqual(
             [op.type for op in opts],
@@ -477,8 +519,8 @@ class TestFtrlOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
-        opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                        init_program)
+        with framework.program_guard(program, init_program):
+            opts = ftrl_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         self.assertEqual([op.type for op in opts],
                          ["fill_constant", "elementwise_mul", "ftrl"])
......
@@ -74,7 +74,11 @@ class TestMNIST(TestParallelExecutorBase):
         label = np.ones(shape=[32, 1], dtype='int64')
         return img, label

-    def _compare_reduce_and_allreduce(self, model, use_cuda):
+    def _compare_reduce_and_allreduce(self,
+                                      model,
+                                      use_cuda,
+                                      delta1=1e-6,
+                                      delta2=1e-4):
         if use_cuda and not core.is_compiled_with_cuda():
             return
@@ -95,9 +99,9 @@ class TestMNIST(TestParallelExecutorBase):
             use_reduce=True)

         for loss in zip(all_reduce_first_loss, reduce_first_loss):
-            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
+            self.assertAlmostEqual(loss[0], loss[1], delta=delta1)
         for loss in zip(all_reduce_last_loss, reduce_last_loss):
-            self.assertAlmostEqual(loss[0], loss[1], delta=1e-4)
+            self.assertAlmostEqual(loss[0], loss[1], delta=delta2)

     # simple_fc
     def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
@@ -174,8 +178,9 @@ class TestMNIST(TestParallelExecutorBase):
                 self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor)

     def test_batchnorm_fc_with_new_strategy(self):
-        # FIXME(zcd): close this test temporally.
-        # self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
+        # NOTE: the computation result of nccl_reduce is non-deterministic,
+        # related issue: https://github.com/NVIDIA/nccl/issues/157
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, True, 1e-5, 1e-2)
         self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
......
@@ -15,6 +15,7 @@
 from __future__ import print_function

 import paddle.fluid as fluid
+from paddle.fluid import compiler
 import paddle.fluid.core as core
 import numpy as np
 import unittest
@@ -61,22 +62,21 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
             exe.run(startup)
             feed_dict = {'image': image, 'label': label}

-            train_exe = fluid.ParallelExecutor(
-                use_cuda=use_cuda,
-                loss_name=loss.name,
-                main_program=main,
-                build_strategy=build_strategy)
-            test_exe = fluid.ParallelExecutor(
-                use_cuda=use_cuda,
-                main_program=test_program,
-                share_vars_from=train_exe,
-                build_strategy=build_strategy)
+            train_cp = compiler.CompiledProgram(main).with_data_parallel(
+                loss_name=loss.name, build_strategy=build_strategy)
+            test_cp = compiler.CompiledProgram(test_program).with_data_parallel(
+                loss_name=loss.name,
+                build_strategy=build_strategy,
+                share_vars_from=train_cp)

             for i in range(5):
-                test_loss, = test_exe.run([loss.name], feed=feed_dict)
-                train_loss, = train_exe.run([loss.name], feed=feed_dict)
+                exe.run(train_cp, feed=feed_dict, fetch_list=[loss.name])
+                test_loss, = exe.run(test_cp,
+                                     feed=feed_dict,
+                                     fetch_list=[loss.name])
+                train_loss, = exe.run(train_cp,
+                                      feed=feed_dict,
+                                      fetch_list=[loss.name])

                 avg_test_loss_val = np.array(test_loss).mean()
                 if math.isnan(float(avg_test_loss_val)):
......
@@ -220,7 +220,10 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
         feed_queue.close()
         self.validate()
-        if not use_decorate_paddle_reader:
+        if use_decorate_paddle_reader:
+            py_reader.exited = True
+            py_reader.thread.join()
+        else:
             thread.join()

     def validate(self):
......
@@ -92,19 +92,10 @@ class TestReaderReset(unittest.TestCase):
                 broadcasted_label = np.ones((ins_num, ) + tuple(
                     self.ins_shape)) * label_val.reshape((ins_num, 1))
                 self.assertEqual(data_val.all(), broadcasted_label.all())
-                for l in label_val:
-                    self.assertFalse(data_appeared[l[0]])
-                    data_appeared[l[0]] = True
             except fluid.core.EOFException:
                 pass_count += 1
-                if with_double_buffer:
-                    data_appeared = data_appeared[:-parallel_exe.device_count *
-                                                  self.batch_size]
-                for i in data_appeared:
-                    self.assertTrue(i)
                 if pass_count < self.test_pass_num:
-                    data_appeared = [False] * self.total_ins_num
                     data_reader_handle.reset()
                 else:
                     break
......
@@ -22,6 +22,14 @@ import numpy
 import functools

+def convert_to_offset(lod):
+    offset = [[0] for i in lod]
+    for i, level in enumerate(lod):
+        for seq_len in level:
+            offset[i].append(offset[i][-1] + seq_len)
+    return offset
+

 class TestReorderLoDTensor(unittest.TestCase):
     num_seq = 5
     # [name, shape, lod_level] pair indicating data info of source and target
@@ -91,13 +99,6 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.inputs[desc[0]] = tensor

     def reorder(self):
-        def convert_to_offset(lod):
-            offset_lod = [[0] for i in lod]
-            for i, level in enumerate(lod):
-                for seq_len in level:
-                    offset_lod[i].append(offset_lod[i][-1] + seq_len)
-            return offset_lod
-
         level = 0
         # compute the rank_table according to ref_lod
         ref_lod = self.data[self.data_desc[1][0]][1][level]
......
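The `convert_to_offset` helper hoisted to module level above turns a length-based LoD into cumulative offsets: each level becomes a prefix sum starting at 0, so sequence i of level l spans rows [offset[l][i], offset[l][i + 1]). A quick illustration of the expected behavior (the sample LoD is taken from the pooling tests below):

    # Four sequences of lengths 4, 1, 3 and 5 become five boundary offsets.
    lod = [[4, 1, 3, 5]]
    assert convert_to_offset(lod) == [[0, 4, 5, 8, 13]]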
@@ -17,33 +17,43 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+from test_reorder_lod_tensor import convert_to_offset

-class TestSeqAvgPool(OpTest):
-    def convert_to_offset(self, lod):
-        offset = [[0] for i in lod]
-        for i, level in enumerate(lod):
-            for seq_len in level:
-                offset[i].append(offset[i][-1] + seq_len)
-        return offset
+def compute_seqpool_sum(x, offset, out):
+    for i in range(len(offset[0]) - 1):
+        sub_x = x[offset[0][i]:offset[0][i + 1], :]
+        out[i] = sub_x.sum(axis=0)
+
+
+def compute_seqpool_avg(x, offset, out):
+    for i in range(len(offset[0]) - 1):
+        sub_x = x[offset[0][i]:offset[0][i + 1], :]
+        out[i] = sub_x.mean(axis=0)
+
+
+def compute_seqpool_sqrt(x, offset, out):
+    for i in range(len(offset[0]) - 1):
+        sub_x = x[offset[0][i]:offset[0][i + 1], :]
+        seq_len = offset[0][i + 1] - offset[0][i]
+        out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len)
+
+
+class TestSeqAvgPool(OpTest):
     def set_data(self):
         self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [11, 23]).astype('float32')
         lod = [[11]]
         self.inputs = {'X': (x, lod)}
-        offset = self.convert_to_offset(lod)
+        offset = convert_to_offset(lod)
         out = np.zeros((len(lod[0]), 23)).astype('float32')
         self.outputs = {'Out': out}
         return x, offset, out

     def compute(self, x, offset, out):
         self.attrs = {'pooltype': "AVERAGE"}
-        for i in range(len(offset[0]) - 1):
-            sub_x = x[offset[0][i]:offset[0][i + 1], :]
-            out[i] = sub_x.mean(axis=0)
+        compute_seqpool_avg(x, offset, out)

     def setUp(self):
         x, offset, out = self.set_data()
@@ -62,9 +72,7 @@ class TestSeqAvgPool(OpTest):
 class TestSeqSumPool(TestSeqAvgPool):
     def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SUM"}
-        for i in range(len(offset[0]) - 1):
-            sub_x = x[offset[0][i]:offset[0][i + 1], :]
-            out[i] = sub_x.sum(axis=0)
+        compute_seqpool_sum(x, offset, out)

 class TestSeqMaxPool(TestSeqAvgPool):
@@ -72,7 +80,7 @@ class TestSeqMaxPool(TestSeqAvgPool):
         self.op_type = 'sequence_pool'
         x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
         lod = [[13]]
-        offset = self.convert_to_offset(lod)
+        offset = convert_to_offset(lod)
         for i in range(len(offset[0]) - 1):
             l = offset[0][i + 1] - offset[0][i]
             x[offset[0][i] + np.random.randint(l), :] += 2.0
@@ -93,10 +101,7 @@ class TestSeqMaxPool(TestSeqAvgPool):
 class TestSeqSqrtPool(TestSeqAvgPool):
     def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SQRT"}
-        for i in range(len(offset[0]) - 1):
-            sub_x = x[offset[0][i]:offset[0][i + 1], :]
-            seq_len = offset[0][i + 1] - offset[0][i]
-            out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len)
+        compute_seqpool_sqrt(x, offset, out)

 class TestSeqLastPool(TestSeqAvgPool):
@@ -122,7 +127,7 @@ class TestSeqAvgPool2D(TestSeqAvgPool):
         x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32')
         lod = [[4, 1, 3, 5]]
         self.inputs = {'X': (x, lod)}
-        offset = self.convert_to_offset(lod)
+        offset = convert_to_offset(lod)
         out = np.zeros((4, 3, 17)).astype('float32')
         self.outputs = {'Out': out}
@@ -167,7 +172,7 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
         x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
         lod = [[4, 1, 3, 5]]
         self.inputs = {'X': (x, lod)}
-        offset = self.convert_to_offset(lod)
+        offset = convert_to_offset(lod)
         for i in range(len(offset[0]) - 1):
             l = offset[0][i + 1] - offset[0][i]
             x[offset[0][i] + np.random.randint(l), :] += 1.0
......
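The `compute_seqpool_*` helpers introduced above mirror the `sequence_pool` op in plain numpy, pooling each sequence slice delimited by the offsets from `convert_to_offset`. A minimal sketch of how they fit together (shapes follow the tests above; values are random):

    import numpy as np

    x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')  # 13 rows of 23 features
    offset = convert_to_offset([[4, 1, 3, 5]])                  # 4 sequences -> [[0, 4, 5, 8, 13]]
    out = np.zeros((4, 23)).astype('float32')
    compute_seqpool_sum(x, offset, out)  # out[i] is the feature-wise sum of sequence i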
@@ -28,6 +28,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
     def initParams(self):
         self.numeric_stable_mode = False
+        self.dtype = np.float64

     def setUp(self):
         self.initParams()
@@ -36,19 +37,19 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         class_num = 37

         logits = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float64")
+                                   [batch_size, class_num]).astype(self.dtype)
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")

         cross_entropy = np.asmatrix(
             [[-np.log(softmax[i][labels[i][0]])]
              for i in range(softmax.shape[0])],
-            dtype="float64")
+            dtype=self.dtype)

         self.inputs = {"Logits": logits, "Label": labels}
         self.outputs = {
-            "Softmax": softmax.astype("float64"),
-            "Loss": cross_entropy.astype("float64")
+            "Softmax": softmax.astype(self.dtype),
+            "Loss": cross_entropy.astype(self.dtype)
         }
         self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
@@ -56,7 +57,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         self.check_output()

     def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss")
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)

 class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
@@ -64,6 +65,55 @@ class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
         self.numeric_stable_mode = True

+class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp):
+    def initParams(self):
+        self.numeric_stable_mode = False
+        self.dtype = np.float16
+
+    def setUp(self):
+        self.initParams()
+        self.op_type = "softmax_with_cross_entropy"
+        batch_size = 41
+        class_num = 37
+
+        # NOTE: numpy float16 have very low accuracy, use float32 for numpy check.
+        logits = np.random.uniform(0.1, 1.0,
+                                   [batch_size, class_num]).astype(np.float32)
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
+
+        cross_entropy = np.asmatrix(
+            [[-np.log(softmax[i][labels[i][0]])]
+             for i in range(softmax.shape[0])],
+            dtype=np.float32)
+
+        self.inputs = {
+            "Logits": logits.astype(self.dtype).view(np.uint16),
+            "Label": labels
+        }
+        self.outputs = {
+            "Softmax": softmax.astype(self.dtype),
+            "Loss": cross_entropy.astype(self.dtype)
+        }
+        self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
+
+    def test_check_output(self):
+        self.check_output(atol=1e-2)
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
+
+
+class TestSoftmaxWithCrossEntropyOpNoCudnnFp16(
+        TestSoftmaxWithCrossEntropyOpFp16):
+    def initParams(self):
+        self.numeric_stable_mode = True
+        self.dtype = np.float16
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
+

 class TestSoftmaxWithCrossEntropyOp2(OpTest):
     """
     Test softmax with cross entropy operator with soft labels.
......
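In the new FP16 cases, `logits.astype(self.dtype).view(np.uint16)` does not change any values: it reinterprets the float16 buffer as uint16, which appears to be how this test framework feeds half-precision data. A standalone numpy illustration of that reinterpretation:

    import numpy as np

    a = np.array([1.0, -2.5], dtype=np.float16)
    bits = a.view(np.uint16)                         # same bytes, integer view
    assert np.array_equal(bits.view(np.float16), a)  # round-trips losslessly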
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from math import log
from math import exp
from op_test import OpTest
from scipy.special import logit
from scipy.special import expit
import unittest


class TestTeacherStudentSigmoidLossOp(OpTest):
    """
    Test teacher_student_sigmoid_loss with discrete one-hot labels.
    """

    def setUp(self):
        self.op_type = "teacher_student_sigmoid_loss"
        batch_size = 16
        num_classes = 1
        self.inputs = {
            'X': logit(
                np.random.uniform(0, 1, (batch_size, num_classes))
                .astype("float32")),
            'Label': np.random.uniform(0, 2, (batch_size, num_classes))
            .astype("float32")
        }
        outs = []
        for index, label in enumerate(self.inputs["Label"]):
            x = self.inputs["X"][index]
            if label < -1.0:
                outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x))))
            elif label < 0.0:
                outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x))))
            elif label < 1.0:
                outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x))) + \
                            max(x, 0.0) - x * label + log(1.0 + exp(-abs(x))))
            else:
                outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x))) + \
                            max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x))))
        self.outputs = {'Y': np.array(outs)}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(["X"], "Y", numeric_grad_delta=0.005)
@@ -752,12 +752,6 @@ class DistributeTranspiler(object):
             elif op not in lr_ops:
                 self._append_pserver_non_opt_ops(block, op)

-        def __op_have_grad_input__(op):
-            for varname in op.input_arg_names:
-                if varname.find("@GRAD") >= 0:
-                    return varname
-            return ""
-
         def __clone_lr_op_sub_block__(op, program, lr_block):
             if not op.has_attr('sub_block'):
                 return
@@ -808,7 +802,7 @@ class DistributeTranspiler(object):
         merged_var = None
         for _, op in enumerate(self.optimize_ops):
             # find the origin grad var before clipping/L2Decay,
-            # merged_var should be the input var name of L2Decaybuil
+            # merged_var should be the input var name of L2Decay
             grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
             if op.attr(OP_ROLE_VAR_ATTR_NAME)[
                     0] == optimize_target_param_name:
@@ -1684,7 +1678,16 @@ class DistributeTranspiler(object):
                 if self.config.enable_dc_asgd:
                     new_inputs[key] = dc
                 else:
-                    new_inputs[key] = merged_var
+                    # NOTE: this handles L2Decay on a sparse gradient, which
+                    # creates a new tensor for the decayed gradient instead of
+                    # modifying the original one in place.
+                    origin_grad_name = opt_op.input(key)[0]
+                    if core.kNewGradSuffix(
+                    ) in origin_grad_name and pserver_block.has_var(
+                            origin_grad_name):
+                        new_grad = pserver_block.var(origin_grad_name)
+                        new_inputs[key] = new_grad
+                    else:
+                        new_inputs[key] = merged_var
             elif key == "Param":
                 param_block = _get_param_block(opt_op)
                 if not param_block:
......