Commit 83f8c403 authored by nhzlx

Merge branch 'develop' of https://github.com/paddlepaddle/paddle into fix_avg_pool_trt_bug

test=develop
@@ -26,6 +26,11 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
if(WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
endif(WIN32)
if(NOT CMAKE_CROSSCOMPILING)
@@ -66,6 +71,8 @@ option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(ON_INFER "Turn on inference optimization." OFF)
@@ -308,7 +315,6 @@ endif()
if (ON_INFER)
message(STATUS "On inference mode, will take place some specific optimization.")
add_definitions(-DPADDLE_ON_INFERENCE)
else()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
......
@@ -218,3 +218,7 @@ endif(WITH_GRPC)
if(WITH_BRPC_RDMA)
add_definitions(-DPADDLE_WITH_BRPC_RDMA)
endif(WITH_BRPC_RDMA)
if(ON_INFER)
add_definitions(-DPADDLE_ON_INFERENCE)
endif(ON_INFER)
@@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
# TODO(panyx0718): CUPTI only allows DSO?
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
if(WIN32)
set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(WIN32)
endif(NOT WITH_DSO)
# setting nvcc arch flags
@@ -196,10 +199,12 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
else(NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS "-g -G")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else()
message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
endif()
endif(NOT WIN32)
......
@@ -2,7 +2,12 @@ if(NOT WITH_GPU)
return()
endif()
if(WIN32)
set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
else(WIN32)
set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
endif(WIN32)
find_path(CUDNN_INCLUDE_DIR cudnn.h
PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
$ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
......
@@ -58,19 +58,21 @@ ExternalProject_Add(
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
-DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
-DBUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN}
-DBUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
)
message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
add_dependencies(extern_anakin protobuf mklml)
add_library(anakin_shared SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
add_dependencies(anakin_shared extern_anakin)
add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin)
list(APPEND external_project_dependencies anakin_shared anakin_saber)
@@ -28,34 +28,28 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
endif()
IF (WIN32)
MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost) MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
else()
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
ENDIF(WIN32)
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE)
set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR})
if (NOT WIN32)
ExternalProject_Add(
${BOOST_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR}
URL ${BOOST_URL}
&& tar zxf ${BOOST_TAR}.tar.gz
DOWNLOAD_NO_PROGRESS 1
PREFIX ${BOOST_SOURCES_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
UPDATE_COMMAND ""
)
endif(NOT WIN32)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
......
@@ -35,7 +35,12 @@ ExternalProject_Add(
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DBUILD_STATIC_LIBS=ON
-DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
@@ -48,8 +53,8 @@ ExternalProject_Add(
IF(WIN32)
IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib")
add_custom_command(TARGET extern_gflags POST_BUILD
COMMAND cmake -E copy ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib
)
ENDIF()
ENDIF(WIN32)
ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
......
@@ -46,7 +46,11 @@ ExternalProject_Add(
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
@@ -63,7 +67,7 @@ ExternalProject_Add(
IF(WIN32)
IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib")
add_custom_command(TARGET extern_glog POST_BUILD
COMMAND cmake -E copy ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib
)
ENDIF()
ENDIF(WIN32)
......
@@ -17,12 +17,8 @@ IF(USE_EIGEN_FOR_BLAS)
ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas)
# IF(WIN32 AND NOT ${CBLAS_FOUND})
IF(NOT ${CBLAS_FOUND})
INCLUDE(ExternalProject)
SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
@@ -34,6 +30,7 @@ IF(NOT ${CBLAS_FOUND})
CACHE FILEPATH "openblas library." FORCE)
ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
IF (WIN32)
SET(CBLAS_FOUND true)
MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR})
......
@@ -140,7 +140,6 @@ endmacro()
set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
IF (WIN32)
SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put it at " ${PROTOBUF_ROOT})
ENDIF(WIN32)
if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
@@ -188,13 +187,20 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
SET(OPTIONAL_ARGS
"-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
"-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
"-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
"-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}"
"-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}"
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
"-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}"
"-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}"
"-Dprotobuf_WITH_ZLIB=ON" "-Dprotobuf_WITH_ZLIB=ON"
"-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
${EXTERNAL_OPTIONAL_ARGS}) ${EXTERNAL_OPTIONAL_ARGS})
SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
ENDIF() ENDIF()
IF(WIN32)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
ENDIF()
SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
......
@@ -21,6 +21,48 @@ INCLUDE(python_module)
FIND_PACKAGE(PythonInterp ${PY_VERSION})
FIND_PACKAGE(PythonLibs ${PY_VERSION})
if(WIN32)
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"from distutils import sysconfig as s;import sys;import struct;
print(sys.prefix);
print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE _PYTHON_VALUES
ERROR_VARIABLE _PYTHON_ERROR_VALUE)
if(NOT _PYTHON_SUCCESS MATCHES 0)
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# Convert the process output into a list
string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
list(GET _PYTHON_VALUES 0 PYTHON_PREFIX)
list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX)
# Make sure all directory separators are '/'
string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})
set(PYTHON_LIBRARY
"${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
# when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the
# original python installation. They may be found relative to PYTHON_INCLUDE_DIR.
if(NOT EXISTS "${PYTHON_LIBRARY}")
get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
set(PYTHON_LIBRARY
"${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
endif()
# raise an error if the python libs are still not found.
if(NOT EXISTS "${PYTHON_LIBRARY}")
message(FATAL_ERROR "Python libraries not found")
endif()
SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
endif(WIN32)
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
......
@@ -14,23 +14,52 @@ ELSE()
ENDIF(APPLE)
ENDIF()
if(WIN32)
ExternalProject_Add(
extern_xxhash
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
GIT_TAG "v0.6.5"
PREFIX ${XXHASH_SOURCE_DIR}
DOWNLOAD_NAME "xxhash"
UPDATE_COMMAND ""
BUILD_IN_SOURCE 1
PATCH_COMMAND
CONFIGURE_COMMAND
${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/src/extern_xxhash/cmake_unofficial
-DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR}
-DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DBUILD_XXHSUM=OFF
-DCMAKE_GENERATOR_PLATFORM=x64
-DBUILD_SHARED_LIBS=OFF
${OPTIONAL_CACHE_ARGS}
TEST_COMMAND ""
)
else()
ExternalProject_Add(
extern_xxhash
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
GIT_TAG "v0.6.5"
PREFIX ${XXHASH_SOURCE_DIR}
DOWNLOAD_NAME "xxhash"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
PATCH_COMMAND
BUILD_COMMAND ${BUILD_CMD}
INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
TEST_COMMAND ""
)
endif()
if (WIN32)
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib")
else()
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
endif ()
INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
add_library(xxhash STATIC IMPORTED GLOBAL)
......
@@ -266,7 +266,11 @@ function(cc_library TARGET_NAME)
if("${cc_library_DEPS};" MATCHES "python;")
list(REMOVE_ITEM cc_library_DEPS python)
add_dependencies(${TARGET_NAME} python)
if(WIN32)
target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
else()
target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup")
endif(WIN32)
endif()
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
@@ -288,6 +292,45 @@ function(cc_library TARGET_NAME)
endif(cc_library_SRCS)
endfunction(cc_library)
# The link operation under windows may exceeds the maximum characters limit, simply break the link command
# into multiple link opeartion can fix that, say
# original:
# lib /out:target.lib a.lib b.lib c.lib d.lib
# after:
# 1. lib /out:dummy_lib_1.lib a.lib b.lib
# 2. lib /out:dummy_lib_2.lib c.lib d.lib
# 1. lib /out:target.lib dummy_lib_1.lib dummy_lib_2.lib
function(sep_library TARGET_NAME)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(dummy_index 1)
set(dummy_offset 1)
# the dummy target would be consisted of limit size libraries
set(dummy_limit 50)
list(LENGTH sep_library_DEPS sep_all_len)
foreach(v ${sep_library_DEPS})
list(APPEND dummy_list ${v})
list(LENGTH dummy_list listlen )
if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len}))
message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}")
cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} STATIC DEPS ${dummy_list})
foreach(i ${dummy_list})
list(REMOVE_AT dummy_list 0)
endforeach()
list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_lib_${dummy_index})
MATH(EXPR dummy_index "${dummy_index}+1")
endif()
MATH(EXPR dummy_offset "${dummy_offset}+1")
endforeach()
if(${sep_library_SHARED})
cc_library(${TARGET_NAME} SHARED SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
else(${sep_library_SHARED})
cc_library(${TARGET_NAME} STATIC SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
endif(${sep_library_SHARED})
endfunction(sep_library)
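A minimal usage sketch (illustration only, not part of this commit): assuming a hypothetical static target my_big_lib built from my_big_lib.cc with a dependency list LONG_DEP_LIST that is long enough to overflow the MSVC link command line, the target would be declared through sep_library instead of cc_library; the helper then packs the dependencies into intermediate dummy archives of roughly 50 libraries each and links the final target against those archives.
# Hypothetical call; my_big_lib, my_big_lib.cc and LONG_DEP_LIST are placeholders.
sep_library(my_big_lib STATIC SRCS my_big_lib.cc DEPS ${LONG_DEP_LIST})
On platforms without a command-length limit the same arguments could be passed to cc_library directly; sep_library only changes how the DEPS list is grouped before linking.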
function(cc_binary TARGET_NAME)
set(options "")
set(oneValueArgs "")
......
@@ -22,144 +22,165 @@ function(copy TARGET)
list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
if (NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
endif ()
math(EXPR len "${copy_lib_SRCS_len} - 1")
add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
foreach (index RANGE ${len})
list(GET copy_lib_SRCS ${index} src)
list(GET copy_lib_DSTS ${index} dst)
if (WIN32)
# windows cmd shell will not expand wildcard automatically.
# below expand the files,libs and copy them by rules.
file(GLOB header_files ${src} "*.h")
file(GLOB static_lib_files ${src} "*.lib")
file(GLOB dll_lib_files ${src} "*.dll")
set(src_files ${header_files} ${static_lib_files} ${dll_lib_files})
if (NOT "${src_files}" STREQUAL "")
list(REMOVE_DUPLICATES src_files)
endif ()
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
)
foreach (src_file ${src_files})
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
COMMENT "copying ${src_file} -> ${dst}")
endforeach ()
else (WIN32) # not windows
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND mkdir -p "${dst}"
COMMAND cp -r "${src}" "${dst}"
COMMENT "copying ${src} -> ${dst}")
endif (WIN32) # not windows
endforeach ()
endfunction()
# third party
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
copy(eigen3_lib
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
DEPS eigen3
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags")
copy(gflags_lib
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS gflags
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog")
copy(glog_lib
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS glog
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/")
copy(boost_lib
SRCS ${BOOST_INCLUDE_DIR}/boost
DSTS ${dst_dir}
DEPS boost
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
copy(xxhash_lib
SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS xxhash
)
if (NOT PROTOBUF_FOUND)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS extern_protobuf
)
endif ()
if (NOT CBLAS_FOUND)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
copy(openblas_lib
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir}
DEPS extern_openblas
)
elseif (WITH_MKLML)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
copy(mklml_lib
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
DEPS mklml
)
endif ()
if (WITH_MKLDNN)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
copy(mkldnn_lib
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS mkldnn
)
endif ()
if (NOT WIN32)
if (NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
copy(snappy_lib
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappy)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
copy(snappystream_lib
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappystream)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS zlib)
endif ()
endif (NOT WIN32)
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework")
if (NOT WIN32)
set(framework_lib_deps framework_py_proto)
endif (NOT WIN32)
copy(framework_lib DEPS ${framework_lib_deps}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
${src_dir}/${module}/ir/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
)
set(module "memory")
copy(memory_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
)
set(inference_deps paddle_fluid_shared paddle_fluid)
set(module "inference/api")
if (WITH_ANAKIN AND WITH_MKL)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
list(APPEND inference_deps anakin_inference_lib)
endif ()
set(module "inference")
copy(inference_lib DEPS ${inference_deps}
@@ -167,30 +188,30 @@ copy(inference_lib DEPS ${inference_deps}
${src_dir}/${module}/api/paddle_*.h
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform")
copy(platform_lib DEPS profiler_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
)
set(module "string")
copy(string_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
)
set(module "pybind")
copy(pybind_lib
SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
DSTS ${dst_dir}/${module}
)
# CMakeCache Info
copy(cmake_cache
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INSTALL_DIR})
# This command generates a complete fluid library for both train and inference
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
@@ -198,9 +219,9 @@ add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
# Following commands generate a inference-only fluid library
# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
copy(third_party DEPS fluid_lib_dist
SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
)
# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library
copy(inference_api_lib DEPS fluid_lib_dist
@@ -213,20 +234,20 @@ add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)
# paddle fluid version
function(version version_file)
execute_process(
COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
file(WRITE ${version_file}
"GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
"WITH_MKL: ${WITH_MKL}\n"
"WITH_MKLDNN: ${WITH_MKLDNN}\n"
"WITH_GPU: ${WITH_GPU}\n")
if (WITH_GPU)
file(APPEND ${version_file}
"CUDA version: ${CUDA_VERSION}\n"
"CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
endif ()
endfunction()
version(${FLUID_INSTALL_DIR}/version.txt)
version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
../../../CONTRIBUTING.md
\ No newline at end of file
@@ -93,7 +93,7 @@ paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized',
paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None))
paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False))
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
@@ -128,6 +128,7 @@ paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates',
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.selu ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
......
@@ -4,11 +4,12 @@ add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(string)
if (NOT WIN32)
add_subdirectory(pybind)
if (NOT WIN32)
add_subdirectory(recordio)
endif(NOT WIN32)
# NOTE: please add subdirectory inference at last.
add_subdirectory(inference)
add_subdirectory(train)
@@ -136,20 +136,32 @@ cc_library(version SRCS version.cc)
cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
if(NOT WIN32)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
endif(NOT WIN32)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
if (NOT WIN32)
py_proto_compile(framework_py_proto SRCS framework.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
if (NOT WIN32)
add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
else(NOT WIN32)
string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
COMMAND copy /Y *.py ${proto_dstpath}
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif(NOT WIN32)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
@@ -163,10 +175,14 @@ if(WITH_DISTRIBUTE)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
if(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
else(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
endif(NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
......
@@ -79,9 +79,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
BuildStrategy strategy_;
};
std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
bool finalize_strategy) const {
if (is_finalized_) {
return pass_builder_;
}
pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
if (finalize_strategy) {
is_finalized_ = true;
}
return pass_builder_;
}
@@ -95,10 +101,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
#else
const bool use_cuda) const {
#endif
// Create a default one if not finalized by user.
CreatePassesFromStrategy(false);
CreatePassesFromStrategy();
}
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
......
@@ -75,12 +75,20 @@ struct BuildStrategy {
bool remove_unnecessary_lock_{false};
// NOTE:
// Before you add new options, think if it's a general strategy that works
// with other strategy. If not, the strategy should be created through
// CreatePassesFromStrategy and the pass can be managed separately.
// User normally doesn't need to call this API.
// The PassBuilder allows for more customized insert, remove of passes
// from python side.
// A new PassBuilder is created based on configs defined above and
// passes are owned by the PassBuilder.
std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy(
bool finalize_strategy) const;
bool IsFinalized() const { return is_finalized_; }
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
@@ -97,6 +105,7 @@ struct BuildStrategy {
#endif
private:
mutable bool is_finalized_ = false;
mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
};
......
@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
...@@ -25,6 +26,7 @@ limitations under the License. */ ...@@ -25,6 +26,7 @@ limitations under the License. */
DECLARE_bool(benchmark); DECLARE_bool(benchmark);
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
namespace paddle {
namespace framework {
@@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
}
}
static void EnableFusedOp(ExecutorPrepareContext* ctx) {
#ifdef PADDLE_WITH_NGRAPH
VLOG(3) << "use_ngraph=True";
auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
for (auto& interval : intervals) {
auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
interval.at(0), interval.at(1));
*interval[0] = std::unique_ptr<OperatorBase>(fused_op);
}
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
ctx->ops_.erase(it->at(0) + 1, it->at(1));
}
#else
LOG(WARNING)
<< "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
#endif
}
Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() {
@@ -338,6 +358,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
return ctx;
}
@@ -486,6 +507,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
<< "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
#endif
}
} // namespace framework
} // namespace paddle
@@ -29,7 +29,7 @@ template <typename T>
class GarbageCollector {
public:
GarbageCollector(const platform::Place &place, size_t max_memory_size)
: max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
garbages_.reset(new std::deque<T *>());
dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
}
......
@@ -41,6 +41,7 @@ pass_library(seq_concat_fc_fuse_pass inference)
pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(is_test_pass base)
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base)
@@ -62,6 +63,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
......
@@ -211,12 +211,12 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
VLOG(30) << "LSTMWeight resized to " << out->dims();
float* out_data = out->mutable_data<float>(platform::CPUPlace());
std::array<const float*, 4> tensors{
W_forget_w0.data<float>(), W_input_w0.data<float>(),
W_output_w0.data<float>(), W_cell_w0.data<float>()};
std::array<const float*, 4> tensors1{
W_forget_w1.data<float>(), W_input_w1.data<float>(),
W_output_w1.data<float>(), W_cell_w1.data<float>()};
for (int row = 0; row < D; row++) {
for (int col = 0; col < 4; col++) {
@@ -238,9 +238,9 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
const LoDTensor& B_output, const LoDTensor& B_cell,
LoDTensor* out) {
std::array<const float*, 4> tensors{
B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
B_cell.data<float>()};
PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
int D = B_forget.dims()[0];
......
@@ -57,6 +57,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims"));
desc.SetType("fc"); desc.SetType("fc");
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
......
@@ -29,6 +29,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
if (type == "mul") {
op->SetInput("X", {inputs[0]});
op->SetInput("Y", {inputs[1]});
op->SetAttr("x_num_col_dims", {1});
} else if (type == "elementwise_add") {
op->SetInput("X", inputs);
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <string>
#include <utility>
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it "
"for activations and pooling.";
auto op_list = {"pool2d", "sigmoid", "logsigmoid",
"softshrink", "exp", "brelu",
"pow", "leaky_relu", "stanh",
"relu", "tanh", "tanh_shrink",
"sqrt", "abs", "ceil",
"elu", "floor", "cos",
"sin", "round", "reciprocal",
"hard_shrink", "hard_sigmoid", "relu6",
"soft_relu", "swish", "thresholded_relu",
"log", "square", "softplus",
"softsign"};
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (op->HasAttr("is_test")) {
op->SetAttr("is_test", true);
} else if (std::find(begin(op_list), end(op_list), op->Type()) !=
end(op_list)) {
op->MutableAttrMap()->insert(
std::pair<std::string, Attribute>("is_test", true));
}
}
}
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(is_test_pass, paddle::framework::ir::IsTestPass);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class IsTestPass : public Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
enum class ISTEST_STATE { FALSE, TRUE, UNSET };
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn = false,
ISTEST_STATE is_test = ISTEST_STATE::UNSET) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("name", name);
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
op->SetAttr("use_mkldnn", use_mkldnn);
if (is_test == ISTEST_STATE::UNSET)
op->MutableAttrMap()->erase("is_test");
else if (is_test == ISTEST_STATE::FALSE)
op->SetAttr("is_test", false);
else
op->SetAttr("is_test", true);
}
// a->pool2d->b
// b->relu->c
// (c,weights1)->conv2d->d
//
// d->pool2d->e
// e->hard_sigmoid->f
// (f,weights2)->conv2d->g
//
// g->pool2d->h
// h->tanh->i
// (i,weights3)->conv2d->j
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h", "i",
"j", "weights1", "weights2", "weights3"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights1" || v == "weights2" || v == "weights3") {
var->SetPersistable(true);
}
}
SetOp(&prog, "pool2d", "pooling1", std::vector<std::string>({"a"}),
std::vector<std::string>({"b"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "relu", "activation1", std::vector<std::string>({"b"}),
std::vector<std::string>({"c"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "conv2d", "conv1", std::vector<std::string>({"c", "weights1"}),
std::vector<std::string>({"d"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "pool2d", "pooling2", std::vector<std::string>({"d"}),
std::vector<std::string>({"e"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "hard_sigmoid", "activation2", std::vector<std::string>({"e"}),
std::vector<std::string>({"f"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "conv2d", "conv2", std::vector<std::string>({"f", "weights2"}),
std::vector<std::string>({"g"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "pool2d", "pooling3", std::vector<std::string>({"g"}),
std::vector<std::string>({"h"}), false, ISTEST_STATE::UNSET);
SetOp(&prog, "tanh", "activation3", std::vector<std::string>({"h"}),
std::vector<std::string>({"i"}), true, ISTEST_STATE::UNSET);
SetOp(&prog, "conv2d", "conv3", std::vector<std::string>({"i", "weights3"}),
std::vector<std::string>({"j"}), false, ISTEST_STATE::UNSET);
return prog;
}
TEST(IsTestPass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("is_test_pass");
graph = pass->Apply(std::move(graph));
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv3") {
ASSERT_FALSE(op->HasAttr("is_test"));
} else {
ASSERT_TRUE(op->HasAttr("is_test"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
}
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(is_test_pass);
...@@ -17,7 +17,12 @@ limitations under the License. */ ...@@ -17,7 +17,12 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
// msvc15 doesn't support constexpr correctly.
#if !defined(_WIN32)
constexpr char Node::kControlDepVarName[]; constexpr char Node::kControlDepVarName[];
#else
const char Node::kControlDepVarName[] = "__control_var";
#endif
std::unique_ptr<Node> CreateNodeForTest(const std::string& name, std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
Node::Type type) { Node::Type type) {
......
...@@ -55,7 +55,11 @@ class Node { ...@@ -55,7 +55,11 @@ class Node {
} }
enum class Type { kOperation, kVariable }; enum class Type { kOperation, kVariable };
#if !defined(_WIN32) // msvc does not support constexpr correctly.
static constexpr char kControlDepVarName[] = "__control_var"; static constexpr char kControlDepVarName[] = "__control_var";
#else
static const char kControlDepVarName[];
#endif
Type NodeType() const { return type_; } Type NodeType() const { return type_; }
......
...@@ -197,26 +197,26 @@ struct PassRegistrar : public Registrar { ...@@ -197,26 +197,26 @@ struct PassRegistrar : public Registrar {
msg) msg)
// Register a new pass that can be applied on the IR. // Register a new pass that can be applied on the IR.
#define REGISTER_PASS(pass_type, pass_class) \ #define REGISTER_PASS(pass_type, pass_class) \
STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \
__reg_pass__##pass_type, \ __reg_pass__##pass_type, \
"REGISTER_PASS must be called in global namespace"); \ "REGISTER_PASS must be called in global namespace"); \
static ::paddle::framework::ir::PassRegistrar<pass_class> \ static ::paddle::framework::ir::PassRegistrar<pass_class> \
__pass_registrar_##pass_type##__(#pass_type); \ __pass_registrar_##pass_type##__(#pass_type); \
int TouchPassRegistrar_##pass_type() { \ int TouchPassRegistrar_##pass_type() { \
__pass_registrar_##pass_type##__.Touch(); \ __pass_registrar_##pass_type##__.Touch(); \
return 0; \ return 0; \
} \ } \
static ::paddle::framework::ir::PassRegistrar<pass_class> \ static ::paddle::framework::ir::PassRegistrar<pass_class> \
&__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \ &__pass_tmp_registrar_##pass_type##__ UNUSED = \
__pass_registrar_##pass_type##__ __pass_registrar_##pass_type##__
#define USE_PASS(pass_type) \ #define USE_PASS(pass_type) \
STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \
__use_pass_itself_##pass_type, \ __use_pass_itself_##pass_type, \
"USE_PASS must be called in global namespace"); \ "USE_PASS must be called in global namespace"); \
extern int TouchPassRegistrar_##pass_type(); \ extern int TouchPassRegistrar_##pass_type(); \
static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \ static int use_pass_itself_##pass_type##_ UNUSED = \
TouchPassRegistrar_##pass_type() TouchPassRegistrar_##pass_type()
} // namespace ir } // namespace ir
......
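The registration macros above switch from the GCC-specific `__attribute__((unused))` to an `UNUSED` helper, presumably so they also compile under MSVC. The macro itself is defined elsewhere in the tree; a sketch of such a portability shim, assumed for illustration only, could look like:
// Assumed definition for illustration; the real UNUSED macro lives in a
// Paddle platform header.
#if defined(__GNUC__) || defined(__clang__)
#define UNUSED __attribute__((unused))
#else
#define UNUSED  // MSVC and others: expand to nothing.
#endif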
...@@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, ...@@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
} }
void NaiveExecutor::Run() { void NaiveExecutor::Run() {
#ifndef PADDLE_ON_INFERENCE
LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set.";
LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation "
"overhead.";
LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference";
#endif // PADDLE_ON_INFERENCE
for (auto &op : ops_) { for (auto &op : ops_) {
VLOG(3) << std::this_thread::get_id() << " run " << op->Type() VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
<< " on scope " << scope_; << " on scope " << scope_;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <functional>
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {};
void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map);
}
} // namespace framework
} // namespace paddle
#endif
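`NG_NODE_MAP` maps a fluid op type to a builder that reads the op's input nodes from the shared var-to-node map and writes its output node back; the table starts empty here and is expected to be filled by per-op builders. A self-contained sketch of that builder shape, using stand-in types and names rather than the real Paddle/nGraph classes:
#include <memory>
#include <string>
#include <unordered_map>
namespace sketch {
struct NgNode {};  // stand-in for ngraph::Node
struct Op {        // stand-in for OperatorBase
  std::string input, output;
  const std::string& Input(const std::string&) const { return input; }
  const std::string& Output(const std::string&) const { return output; }
};
using VarNodeMap = std::unordered_map<std::string, std::shared_ptr<NgNode>>;
// A builder reads the op's input node(s) and emits the output node.
// This pass-through version simply reuses the input node for the output.
void BuildIdentityNode(const std::shared_ptr<Op>& op,
                       std::shared_ptr<VarNodeMap> node_map) {
  (*node_map)[op->Output("Out")] = (*node_map)[op->Input("X")];
}
}  // namespace sketch
// Registration would then look roughly like:
//   NgraphBridge::NG_NODE_MAP["my_op"] = BuildMyOpNode;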
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
class NgraphBridge {
public:
static std::map<
std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NG_NODE_MAP;
explicit NgraphBridge(
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map)
: ngb_node_map(var_node_map) {}
void build_graph(const std::shared_ptr<OperatorBase>& op);
private:
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map;
};
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <glog/logging.h>
#include <algorithm>
#include <map>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h"
namespace paddle {
namespace framework {
static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
{proto::VarType::FP32, ngraph::element::f32},
{proto::VarType::FP64, ngraph::element::f64},
{proto::VarType::INT32, ngraph::element::i32},
{proto::VarType::INT64, ngraph::element::i64},
{proto::VarType::BOOL, ngraph::element::boolean},
};
typedef enum { /* nGraph support state on ops */
FULL_TRAIN, /* Support full ops for train */
PARTIAL_TRAIN, /* Support partial ops for train */
FULL_TEST, /* Support full list of ops for test */
PARTIAL_TEST /* Support partial list of ops for test */
} op_state;
class NgraphOperator {
public:
explicit NgraphOperator(const Scope& scope, const platform::Place& place,
const std::vector<std::shared_ptr<OperatorBase>>& ops,
const std::unordered_map<
std::string, ngraph::element::Type>& var_type_map,
const std::unordered_set<std::string>& persist,
const std::unordered_set<std::string>& fetches,
const std::unordered_set<std::string>& post_op_inputs,
op_state ng_op_state)
: scope_(scope),
place_(place),
fused_ops_(ops),
var_type_map_(var_type_map),
persistables_(persist),
fetches_(fetches),
post_op_inputs_(post_op_inputs),
ng_op_state_(ng_op_state) {}
void Run(const Scope& scope, const platform::Place& place) const;
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache;
const Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
op_state ng_op_state_;
};
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
FusedOperator::FusedOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
intervals;
if (ops->empty()) {
return intervals;
}
size_t size = ops->size();
size_t left = 0;
while (left < size && ops->at(left)->Type() != kFeedOpType) {
++left;
}
if (left == size) {
return intervals;
}
while (left < size && ops->at(left)->Type() == kFeedOpType) {
++left;
}
size_t right = left;
while (right < size && ops->at(right)->Type() != kFetchOpType) {
++right;
}
if (right == size) {
return intervals;
}
if (left >= right) return intervals;
// (left, right - 1) represents indices between feed and fetch
size_t pivot = left;
while (pivot < right) {
auto op_type = ops->at(pivot)->Type();
if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) ==
paddle::framework::NgraphBridge::NG_NODE_MAP.end()) {
++pivot;
} else {
size_t start = pivot, end = start;
while (pivot < right &&
(paddle::framework::NgraphBridge::NG_NODE_MAP.find(
ops->at(pivot)->Type()) !=
paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
++pivot;
++end;
}
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>
interval = {ops->begin() + start, ops->begin() + end};
intervals.push_back(interval);
}
} // end while
return intervals;
}
FusedOperator::FusedOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs), pdesc_(prog), block_(block_id) {
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
it != end; ++it) {
fused_ops_.push_back(std::move(*it));
}
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = end;
(*it)->Type() != kFetchOpType; ++it) {
for (auto& var_name_item : (*it)->Inputs()) {
for (auto& var_name : var_name_item.second) {
post_op_inputs_.insert(var_name);
}
}
}
if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
is_full_ = true;
}
Process();
}
void FusedOperator::Process() {
auto& bdesc = pdesc_.Block(block_);
for (auto& var : bdesc.AllVars()) {
if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
var->GetType() == proto::VarType::LOD_TENSOR ||
var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) {
continue;
}
auto var_name = var->Name();
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var_name != "fetch" && var_name != "feed") {
auto pd_type = var->GetDataType();
if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) {
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
var_name);
}
var_type_map_[var_name] = pd2ng_type_map[pd_type];
}
if (var->Persistable()) {
persistables_.insert(var->Name());
}
}
for (auto* op : bdesc.AllOps()) {
if (op->Type() == kFetchOpType) {
std::string fetch_target_name = op->Input("X")[0];
fetches_.insert(fetch_target_name);
}
}
}
void FusedOperator::RunImpl(const Scope& scope,
const platform::Place& place) const {
op_state ng_op_state = PARTIAL_TEST;
auto& bdesc = pdesc_.Block(block_);
for (auto* op : bdesc.AllOps()) {
if (op->Type().find("_grad") != std::string::npos) {
ng_op_state = PARTIAL_TRAIN;
break;
}
}
if (is_full_) {
ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
}
NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_,
persistables_, fetches_, post_op_inputs_,
ng_op_state);
ngraph_op.Run(scope, place);
}
} // namespace framework
} // namespace paddle
#endif
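FusedOpIntervals above skips the leading feed ops, stops at the first fetch op, and groups maximal runs of consecutive ops that NgraphBridge knows how to convert. A toy version of that grouping over plain op-type strings, illustrative only:
#include <set>
#include <string>
#include <utility>
#include <vector>
// Return [start, end) index ranges of maximal runs of supported ops,
// skipping leading feed ops and stopping at the first fetch op.
std::vector<std::pair<size_t, size_t>> SupportedRuns(
    const std::vector<std::string>& ops,
    const std::set<std::string>& supported) {
  std::vector<std::pair<size_t, size_t>> runs;
  size_t i = 0;
  while (i < ops.size() && ops[i] == "feed") ++i;
  while (i < ops.size() && ops[i] != "fetch") {
    if (!supported.count(ops[i])) { ++i; continue; }
    size_t start = i;
    while (i < ops.size() && ops[i] != "fetch" && supported.count(ops[i])) ++i;
    runs.emplace_back(start, i);
  }
  return runs;
}
// e.g. ops = {"feed", "relu", "mul", "dropout", "tanh", "fetch"} and
//      supported = {"relu", "mul", "tanh"}  ->  runs = {(1, 3), (4, 5)}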
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
class FusedOperator : public OperatorBase {
public:
static std::vector<
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
FusedOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
explicit FusedOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type = "fused_op", const VariableNameMap& inputs = {},
const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
void RunImpl(const Scope& scope, const platform::Place& place) const final;
private:
const ProgramDesc pdesc_;
size_t block_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
bool is_full_ = false;
void Process();
};
} // namespace framework
} // namespace paddle
#endif
...@@ -63,6 +63,8 @@ struct OpKernelType { ...@@ -63,6 +63,8 @@ struct OpKernelType {
place_(dev_ctx.GetPlace()), place_(dev_ctx.GetPlace()),
library_type_(library_type) {} library_type_(library_type) {}
size_t hash_key() const { return Hash()(*this); }
bool operator==(const OpKernelType& o) const { bool operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) && return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
......
...@@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false, ...@@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Combine two hash values into a single hash.
inline size_t CombineHash(size_t seed, size_t a) {
return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = { std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
...@@ -150,14 +155,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -150,14 +155,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#endif #endif
} }
// The profile has a process-wide mutex, which results in a serious performance issue // The profile has a process-wide mutex, which results in a serious performance issue
// in concurrency scenarios. Here we use an `if` to fix this issue. // in concurrency scenarios. Here we use an `if` to fix this issue.
// Please do not remove the `if`; ask @Superjomn if there are any concerns. // Please do not remove the `if`; ask @Superjomn if there are any concerns.
#ifndef _WIN32
if (platform::IsProfileEnabled()) { if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place)); platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place); RunImpl(scope, place);
} else { } else
#endif
{
RunImpl(scope, place); RunImpl(scope, place);
} }
VLOG(30) << place << " " << DebugStringEx(&scope); VLOG(30) << place << " " << DebugStringEx(&scope);
...@@ -791,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack( ...@@ -791,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack(
Scope* OperatorWithKernel::TryTransferData( Scope* OperatorWithKernel::TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key, const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const { std::vector<std::string>* transfered_inplace_vars) const {
// In the inference scenario, the scopes will be reused across batches, so
// the `new_scope` here would cause GPU memory usage to explode over the
// course of running the operators.
// We use a thread_local cache to fix that issue; the cache key is the
// combination of the `scope` argument, from_kernel_type and target_kernel_type.
// Discuss with @Superjomn or the inference developers if any change to this
// logic might not be covered by the other scenarios.
#ifdef PADDLE_ON_INFERENCE
thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
#endif
Scope* new_scope = nullptr; Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) { for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
...@@ -821,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData( ...@@ -821,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData(
VLOG(30) << "Transform Variable " << var_name << " from " VLOG(30) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key; << kernel_type_for_var << " to " << expected_kernel_key;
#ifdef PADDLE_ON_INFERENCE
size_t infer_cache_key =
CombineHash(OpKernelType::Hash()(kernel_type_for_var),
OpKernelType::Hash()(expected_kernel_key));
infer_cache_key =
CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
auto it = infer_transfer_scope_cache.find(infer_cache_key);
if (it != infer_transfer_scope_cache.end()) {
new_scope = infer_transfer_scope_cache[infer_cache_key];
} else {
new_scope = &scope.NewScope();
infer_transfer_scope_cache[infer_cache_key] = new_scope;
}
#endif
if (new_scope == nullptr) { if (new_scope == nullptr) {
new_scope = &scope.NewScope(); new_scope = &scope.NewScope();
} }
auto* trans_var = new_scope->Var(var_name); auto* trans_var = new_scope->Var(var_name);
Tensor out; Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
SetTensorToVariable(*var, out, trans_var); SetTensorToVariable(*var, out, trans_var);
......
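Condensed sketch of the caching idea introduced in TryTransferData above: key a reusable transfer scope on the two kernel-type hashes plus the parent scope pointer, with one cache per thread. Types and names below are simplified stand-ins, not the exact Paddle code:
#include <cstddef>
#include <functional>
#include <unordered_map>
struct FakeScope {};  // stand-in for framework::Scope
inline size_t CombineHash(size_t seed, size_t a) {
  return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
FakeScope* GetTransferScope(size_t from_kernel_hash, size_t to_kernel_hash,
                            const FakeScope* parent) {
  // One cache per thread, mirroring the thread_local map in TryTransferData.
  thread_local std::unordered_map<size_t, FakeScope*> cache;
  size_t key = CombineHash(from_kernel_hash, to_kernel_hash);
  key = CombineHash(key, std::hash<const FakeScope*>()(parent));
  auto it = cache.find(key);
  if (it != cache.end()) return it->second;
  auto* scope = new FakeScope();  // in Paddle: &parent->NewScope()
  cache[key] = scope;
  return scope;
}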
...@@ -42,7 +42,7 @@ DEFINE_double( ...@@ -42,7 +42,7 @@ DEFINE_double(
// a mean time, but a scope may be read by multiple threads concurrently, and // a mean time, but a scope may be read by multiple threads concurrently, and
// the mutex will cause serious performance issue. // the mutex will cause serious performance issue.
// So the mutex is disabled when `ON_INFER`. // So the mutex is disabled when `ON_INFER`.
#ifdef ON_INFER #ifdef PADDLE_ON_INFERENCE
#define SCOPE_LOCK_GUARD #define SCOPE_LOCK_GUARD
#else #else
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_); #define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
......
...@@ -16,9 +16,21 @@ cc_library(paddle_fluid_api ...@@ -16,9 +16,21 @@ cc_library(paddle_fluid_api
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS)
if (WIN32)
list(APPEND fluid_third_partys gflags glog protobuf cblas)
endif(WIN32)
# paddle_fluid_origin exclude inference api interface # paddle_fluid_origin exclude inference api interface
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) if(WIN32)
sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid_origin ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
endif(WIN32)
add_subdirectory(api) add_subdirectory(api)
...@@ -28,8 +40,16 @@ set(SHARED_INFERENCE_SRCS ...@@ -28,8 +40,16 @@ set(SHARED_INFERENCE_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
# Create static library if(WIN32)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
analysis_config paddle_pass_builder)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
analysis_config paddle_pass_builder)
endif(WIN32)
if(NOT APPLE) if(NOT APPLE)
# TODO(liuyiqun): Temporarily disable the link flag because it is not supported on Mac. # TODO(liuyiqun): Temporarily disable the link flag because it is not supported on Mac.
...@@ -38,11 +58,20 @@ if(NOT APPLE) ...@@ -38,11 +58,20 @@ if(NOT APPLE)
endif() endif()
# Create shared library # Create shared library
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} if(WIN32)
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
target_link_libraries(paddle_fluid_shared shlwapi)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid_origin ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
endif()
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(NOT APPLE) if(NOT APPLE AND NOT WIN32)
# TODO(liuyiqun): Temporarily disable the link flag because it is not supported on Mac. # TODO(liuyiqun): Temporarily disable the link flag because it is not supported on Mac.
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map") set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map")
set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
......
...@@ -26,6 +26,7 @@ limitations under the License. */ ...@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -124,20 +125,6 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) { ...@@ -124,20 +125,6 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) {
return *var->GetMutable<T>(); return *var->GetMutable<T>();
} }
static void ExecShellCommand(const std::string &cmd, std::string *message) {
char buffer[128];
std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
if (!pipe) {
LOG(ERROR) << "error running command: " << cmd;
return;
}
while (!feof(pipe.get())) {
if (fgets(buffer, 128, pipe.get()) != nullptr) {
*message += buffer;
}
}
}
static framework::proto::ProgramDesc LoadProgramDesc( static framework::proto::ProgramDesc LoadProgramDesc(
const std::string &model_path) { const std::string &model_path) {
std::ifstream fin(model_path, std::ios::in | std::ios::binary); std::ifstream fin(model_path, std::ios::in | std::ios::binary);
......
...@@ -412,7 +412,7 @@ void DetachDeletedNodes(framework::ir::Graph *graph) { ...@@ -412,7 +412,7 @@ void DetachDeletedNodes(framework::ir::Graph *graph) {
void SubGraphFuser::ReplaceNodesWithSubGraphs() { void SubGraphFuser::ReplaceNodesWithSubGraphs() {
auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)(); auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)();
for (auto &subgraph : subgraphs) { for (auto &subgraph : subgraphs) {
if (subgraph.size() <= min_subgraph_size_) continue; if (subgraph.size() <= (size_t)min_subgraph_size_) continue;
LOG(INFO) << "detect a subgraph size " << subgraph.size(); LOG(INFO) << "detect a subgraph size " << subgraph.size();
std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end()); std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
// replace this sub-graph with the first node. Two steps: 1. Create a Block // replace this sub-graph with the first node. Two steps: 1. Create a Block
......
...@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, ...@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// it is either an OP's input or an OP's output. // it is either an OP's input or an OP's output.
auto &subgraph_nodes = *Agent(node).subgraph(); auto &subgraph_nodes = *Agent(node).subgraph();
for (int index = 0; index < block_desc.OpSize(); index++) { for (size_t index = 0; index < block_desc.OpSize(); index++) {
framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
auto correspond_node = subgraph_nodes[index]; auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
......
...@@ -45,7 +45,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { ...@@ -45,7 +45,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
std::unordered_set<std::string> teller_set( std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "dropout", "split"}); "elementwise_add", "dropout", "split", "prelu", "conv2d_transpose"});
if (!node->IsOp()) return false; if (!node->IsOp()) return false;
if (teller_set.count(node->Op()->Type())) { if (teller_set.count(node->Op()->Type())) {
......
...@@ -549,4 +549,6 @@ USE_TRT_CONVERTER(concat); ...@@ -549,4 +549,6 @@ USE_TRT_CONVERTER(concat);
USE_TRT_CONVERTER(dropout); USE_TRT_CONVERTER(dropout);
USE_TRT_CONVERTER(pad); USE_TRT_CONVERTER(pad);
USE_TRT_CONVERTER(split); USE_TRT_CONVERTER(split);
USE_TRT_CONVERTER(prelu);
USE_TRT_CONVERTER(conv2d_transpose);
#endif #endif
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thread> #include <thread> // NOLINT
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
......
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <thread> //NOLINT #include <thread> //NOLINT
#include "utils.h" #include "utils.h" // NOLINT
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_bool(use_gpu, false, "Whether use gpu."); DEFINE_bool(use_gpu, false, "Whether use gpu.");
......
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); ...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
......
...@@ -15,9 +15,14 @@ ...@@ -15,9 +15,14 @@
#pragma once #pragma once
#include <glog/logging.h> #include <glog/logging.h>
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#endif
#include <algorithm> #include <algorithm>
#include <chrono> // NOLINT #include <chrono> // NOLINT
#include <iterator>
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
#include <string> #include <string>
......
...@@ -49,6 +49,8 @@ struct AnalysisConfig : public NativeConfig { ...@@ -49,6 +49,8 @@ struct AnalysisConfig : public NativeConfig {
void EnableTensorRtEngine(int workspace_size = 1 << 20, void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1); int max_batch_size = 1);
bool use_tensorrt() const { return use_tensorrt_; }
// NOTE this is just for internal development; please do not use it. // NOTE this is just for internal development; please do not use it.
// NOT stable yet. // NOT stable yet.
void EnableMKLDNN(); void EnableMKLDNN();
......
...@@ -86,12 +86,13 @@ class CpuPassStrategy : public PassStrategy { ...@@ -86,12 +86,13 @@ class CpuPassStrategy : public PassStrategy {
"fc_fuse_pass", // "fc_fuse_pass", //
"conv_bn_fuse_pass", // "conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
}); });
} }
virtual ~CpuPassStrategy() = default; virtual ~CpuPassStrategy() = default;
virtual void EnableMKLDNN() override { void EnableMKLDNN() override {
// TODO(Superjomn) Consider the way to mix CPU with GPU. // TODO(Superjomn) Consider the way to mix CPU with GPU.
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
passes_.insert(passes_.begin(), "mkldnn_placement_pass"); passes_.insert(passes_.begin(), "mkldnn_placement_pass");
...@@ -123,7 +124,7 @@ class GpuPassStrategy : public PassStrategy { ...@@ -123,7 +124,7 @@ class GpuPassStrategy : public PassStrategy {
GpuPassStrategy(const GpuPassStrategy &other) GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {} : PassStrategy(other.AllPasses()) {}
virtual void EnableMKLDNN() override; void EnableMKLDNN() override;
virtual ~GpuPassStrategy() = default; virtual ~GpuPassStrategy() = default;
}; };
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
nv_library(tensorrt_converter nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
pad_op.cc split_op.cc pad_op.cc split_op.cc prelu_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
...@@ -16,7 +16,7 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc ...@@ -16,7 +16,7 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op conv_transpose_op SERIAL)
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op tensorrt_plugin SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op tensorrt_plugin SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
...@@ -33,4 +33,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc ...@@ -33,4 +33,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
split_op concat_op SERIAL) split_op concat_op SERIAL)
nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
prelu_op SERIAL)
...@@ -18,92 +18,139 @@ namespace paddle { ...@@ -18,92 +18,139 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
bool to_skip_merging_optimize(TensorRTEngine* engine_, bool to_skip_merging_optimize(TensorRTEngine* engine,
const std::vector<int>& filters, const std::vector<int>& filters,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
std::string input_name) { std::string input_name) {
if (engine_->itensor_quote_num[input_name] > 0) { if (engine->itensor_quote_num[input_name] > 0) {
return true; return true;
} }
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 && if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0) strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
engine_->itensor_quote_num[input_name] += 1; engine->itensor_quote_num[input_name] += 1;
return false; return false;
} }
template <typename RegistFunc, typename SetDilationFunc>
void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode,
RegistFunc fadd_layer, SetDilationFunc fset_dilation,
const std::string& name) {
VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias";
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
PADDLE_ENFORCE(engine != nullptr);
auto* X = engine->GetITensor(op_desc.Input("Input").front());
// Declare weights
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
platform::CPUPlace cpu_place;
std::unique_ptr<framework::LoDTensor> weight_tensor(
new framework::LoDTensor());
weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
const int n_output = weight_tensor->dims()[0];
const int n_input = weight_tensor->dims()[1];
const int filter_h = weight_tensor->dims()[2];
const int filter_w = weight_tensor->dims()[3];
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
const std::vector<int> dilations =
boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<size_t>(weight_tensor->numel())};
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input,
nv_ksize, weight, bias);
PADDLE_ENFORCE(layer != nullptr);
layer->setStride(nv_strides);
layer->setPadding(nv_paddings);
layer->setNbGroups(groups);
// set dilations
fset_dilation(layer, nv_dilations);
auto output_name = op_desc.Output("Output").front();
layer->setName((name + " (Output: " + output_name + ")").c_str());
engine->weight_map[op_desc.Input("Filter").front()] =
std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str());
engine->SetITensor(output_name, layer->getOutput(0));
if (test_mode ||
to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
op_desc.Input("Input").front())) {
engine->DeclareOutput(output_name);
}
}
class Conv2dOpConverter : public OpConverter { class Conv2dOpConverter : public OpConverter {
public: public:
void operator()(const framework::proto::OpDesc& op, void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override { const framework::Scope& scope, bool test_mode) override {
VLOG(3) << "convert a fluid conv2d op to tensorrt conv layer without bias"; ConvertConv2d(
engine_, op, scope, test_mode,
framework::OpDesc op_desc(op, nullptr); [&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); int n_input, /* Conv input maps */
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1); TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
auto* layer =
auto* X = engine_->GetITensor(op_desc.Input("Input").front()); TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output,
ksize, weight.get(), bias.get());
// Declare weights return layer;
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); },
PADDLE_ENFORCE_NOT_NULL(Y_v); [](nvinfer1::IConvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>(); layer->setDilation(dilations);
},
platform::CPUPlace cpu_place; "conv2d");
std::unique_ptr<framework::LoDTensor> weight_tensor( }
new framework::LoDTensor()); };
weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); class Deconv2dOpConverter : public OpConverter {
public:
auto* weight_data = void operator()(const framework::proto::OpDesc& op,
weight_tensor->mutable_data<float>(platform::CPUPlace()); const framework::Scope& scope, bool test_mode) override {
ConvertConv2d(
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); engine_, op, scope, test_mode,
const int n_output = weight_tensor->dims()[0]; [&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */
const int filter_h = weight_tensor->dims()[2]; int n_input, /* Deconv output maps */
const int filter_w = weight_tensor->dims()[3]; nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
const int groups = boost::get<int>(op_desc.GetAttr("groups")); auto* layer =
const std::vector<int> dilations = TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input,
boost::get<std::vector<int>>(op_desc.GetAttr("dilations")); ksize, weight.get(), bias.get());
const std::vector<int> strides = return layer;
boost::get<std::vector<int>>(op_desc.GetAttr("strides")); },
const std::vector<int> paddings = [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
boost::get<std::vector<int>>(op_desc.GetAttr("paddings")); PADDLE_ENFORCE(
dilations.d[0] == 1 && dilations.d[1] == 1,
nvinfer1::DimsHW nv_ksize(filter_h, filter_w); "Dilations must be (1, 1) for tensorRT, but given (%d, %d)",
nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]); dilations.d[0], dilations.d[1]);
nvinfer1::DimsHW nv_strides(strides[0], strides[1]); },
nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); "conv2d_transpose");
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
weight_tensor->memory_size() / sizeof(float)};
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
auto* layer = TRT_ENGINE_ADD_LAYER(
engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
nv_ksize, weight.get(), bias.get());
PADDLE_ENFORCE(layer != nullptr);
layer->setStride(nv_strides);
layer->setPadding(nv_paddings);
layer->setDilation(nv_dilations);
layer->setNbGroups(groups);
auto output_name = op_desc.Output("Output").front();
layer->setName(("conv2d (Output: " + output_name + ")").c_str());
engine_->weight_map[op_desc.Input("Filter").front()] =
std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode ||
to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
paddings, op_desc.Input("Input").front())) {
engine_->DeclareOutput(output_name);
}
} }
}; };
...@@ -112,3 +159,4 @@ class Conv2dOpConverter : public OpConverter { ...@@ -112,3 +159,4 @@ class Conv2dOpConverter : public OpConverter {
} // namespace paddle } // namespace paddle
REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter); REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
REGISTER_TRT_OP_CONVERTER(conv2d_transpose, Deconv2dOpConverter);
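The refactor above folds the conv2d and conv2d_transpose converters into one ConvertConv2d routine parameterized by two callables: one that creates the TensorRT layer and one that applies (or rejects) the dilation. A stripped-down, compilable sketch of that pattern with toy types, not the TensorRT API:
#include <iostream>
#include <string>
struct ToyLayer { std::string kind; };  // stand-in for an nvinfer1 layer
// Shared conversion routine: the only op-specific step (layer creation)
// is injected as a callable; everything else is common code.
template <typename AddLayerFn>
ToyLayer Convert(const std::string& op_type, AddLayerFn add_layer) {
  // ... shared work: read weights, strides, paddings ...
  ToyLayer layer = add_layer();
  // ... shared work: set common attributes, register outputs ...
  std::cout << "converted " << op_type << " via " << layer.kind << "\n";
  return layer;
}
int main() {
  Convert("conv2d", [] { return ToyLayer{"Convolution"}; });
  Convert("conv2d_transpose", [] { return ToyLayer{"Deconvolution"}; });
  return 0;
}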
...@@ -34,7 +34,8 @@ class ElementwiseWeightOpConverter : public OpConverter { ...@@ -34,7 +34,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* X = engine_->GetITensor(op_desc.Input("X").front());
nvinfer1::Dims dims_x = X->getDimensions(); nvinfer1::Dims dims_x = X->getDimensions();
PADDLE_ENFORCE(dims_x.nbDims >= 3); PADDLE_ENFORCE(dims_x.nbDims >= 3, "x dims expects at least 3, but %d is given.",
dims_x.nbDims);
auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
PADDLE_ENFORCE_NOT_NULL(Y_v); PADDLE_ENFORCE_NOT_NULL(Y_v);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* PRelu converter from fluid to tensorRT.
*/
class PReluOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert fluid prelu op to tensorrt prelu layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
int input_num = op_desc.Input("X").size();
PADDLE_ENFORCE(input_num == 1);
auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
// Get output
size_t output_num = op_desc.Output("Out").size();
PADDLE_ENFORCE(output_num == 1);
// Get attrs
std::string mode = boost::get<std::string>(op_desc.GetAttr("mode"));
//
auto* alpha_var = scope.FindVar(op_desc.Input("Alpha")[0]);
PADDLE_ENFORCE_NOT_NULL(alpha_var);
auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();
platform::CUDAPlace place;
std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
new framework::LoDTensor());
alpha_tensor_device->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
// Transform alpha to TensorRTEngine::Weight
TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data),
alpha_tensor_device->numel());
PReluPlugin* plugin = new PReluPlugin(alpha_rt, mode);
nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin);
// keep the alpha tensor alive to avoid releasing its memory
engine_->weight_map[op_desc.Input("Alpha")[0]] =
std::move(alpha_tensor_device);
std::string layer_name = "prelu (Output: ";
auto output_name = op_desc.Output("Out")[0];
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
layer_name += output_name;
if (test_mode) {
engine_->DeclareOutput(output_name);
}
layer->setName((layer_name + ")").c_str());
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(prelu, PReluOpConverter);
...@@ -26,7 +26,7 @@ class SplitOpConverter : public OpConverter { ...@@ -26,7 +26,7 @@ class SplitOpConverter : public OpConverter {
public: public:
void operator()(const framework::proto::OpDesc& op, void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override { const framework::Scope& scope, bool test_mode) override {
VLOG(40) << "convert a fluid split op to tensorrt split layer"; VLOG(4) << "convert a fluid split op to tensorrt split layer";
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
// Declare inputs // Declare inputs
......
...@@ -16,6 +16,9 @@ limitations under the License. */ ...@@ -16,6 +16,9 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
USE_OP(conv2d);
USE_OP(conv2d_transpose);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
...@@ -51,7 +54,37 @@ TEST(conv2d_op, test) { ...@@ -51,7 +54,37 @@ TEST(conv2d_op, test) {
validator.Execute(3); validator.Execute(3);
} }
TEST(conv2d_transpose_op, test) {
std::unordered_set<std::string> parameters({"deconv2d-Y"});
framework::Scope scope;
TRTConvertValidation validator(5, parameters, scope, 1 << 15);
validator.DeclInputVar("deconv2d-X", nvinfer1::Dims3(3, 5, 5));
validator.DeclParamVar("deconv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
validator.DeclOutputVar("deconv2d-Out", nvinfer1::Dims3(2, 5, 5));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("conv2d_transpose");
desc.SetInput("Input", {"deconv2d-X"});
desc.SetInput("Filter", {"deconv2d-Y"});
desc.SetOutput("Output", {"deconv2d-Out"});
const std::vector<int> strides({1, 1});
const std::vector<int> paddings({1, 1});
const std::vector<int> dilations({1, 1});
const int groups = 1;
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("dilations", dilations);
desc.SetAttr("groups", groups);
validator.SetOp(*desc.Proto());
validator.Execute(3);
}
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(conv2d);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(prelu_op, test_channel_wise) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(3, 1, 1));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("channel"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(prelu_op, test_element_wise) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims4(10, 3, 2, 2));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("element"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(prelu_op, test_scalar) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(1, 1, 1));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("all"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// USE_OP(prelu);
USE_CPU_ONLY_OP(prelu);
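The three tests above correspond to the three alpha-broadcast modes of prelu. As a quick reference (an illustrative summary of the element-wise rule, matching the parameter shapes declared above):

out_i = x_i             if x_i > 0
out_i = alpha_i * x_i   otherwise

where alpha has shape (C, 1, 1) and is broadcast over H x W in "channel" mode, has the same shape as the full input batch in "element" mode, and is a single scalar in "all" mode.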
...@@ -200,7 +200,8 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, ...@@ -200,7 +200,8 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
Buffer &TensorRTEngine::buffer(const std::string &name) { Buffer &TensorRTEngine::buffer(const std::string &name) {
PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first."); PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
auto it = buffer_sizes_.find(name); auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
name);
auto slot_offset = infer_engine_->getBindingIndex(name.c_str()); auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
return buffers_[slot_offset]; return buffers_[slot_offset];
} }
......
...@@ -40,6 +40,7 @@ class TensorRTEngine : public EngineBase { ...@@ -40,6 +40,7 @@ class TensorRTEngine : public EngineBase {
// Weight is model parameter. // Weight is model parameter.
class Weight { class Weight {
public: public:
Weight() = default;
Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) { Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
w_.type = dtype; w_.type = dtype;
w_.values = value; w_.values = value;
......
nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu
avg_pool_op_plugin.cu DEPS enforce pooling) prelu_op_plugin.cu avg_pool_op_plugin.cu DEPS enforce pooling)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <cassert>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535;
inline static int GET_NUM_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
__global__ void PReluChannelWiseKernel(const float *input, const float *alpha,
float *output, int channel,
size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float *out = output + offset;
float scale = alpha[blockIdx.x % channel];
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
__global__ void PReluElementWiseKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
const float *scale = alpha + offset;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale[i] * x;
}
}
__global__ void PReluScalarKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float scale = *alpha;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
static inline void PReluChannelWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, dims.d[0], spatial_size);
}
static inline void PReluElementWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
static inline void PReluScalar(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size, const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
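All three launchers above share the same configuration: one thread block per batch-times-channel slice (unroll = batch_size * dims.d[0]) with CUDA_NUM_THREADS threads striding over the spatial extent, and CHECK_LT guarding the 65535 limit of a one-dimensional grid. As an illustrative example, a batch of 10 with a 3x2x2 CHW input gives unroll = 30 blocks and spatial_size = 4, so each block's 1024 threads cover its four elements in a single strided pass.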
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims,
int nbInputs) {
assert(nbInputs == 1);
assert(index < this->getNbOutputs());
nvinfer1::Dims const &input_dims = inputDims[0];
nvinfer1::Dims output_dims = input_dims;
return output_dims;
}
int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
// Input dims are CHW.

const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]);
const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
float *output = reinterpret_cast<float **>(outputs)[0];
if (mode_ == "channel") {
PReluChannelWise(stream, input, alpha, output, batchSize, input_dims);
} else if (mode_ == "element") {
PReluElementWise(stream, input, alpha, output, batchSize, input_dims);
} else {
PReluScalar(stream, input, alpha, output, batchSize, input_dims);
}
return cudaGetLastError() != cudaSuccess;
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
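For comparison, a minimal host-side reference of the same three broadcast modes (an illustrative sketch, not part of the plugin; the function name and layout assumptions are hypothetical):

#include <cstddef>
#include <string>
#include <vector>

// Reference PReLU over a flattened NCHW tensor. "channel" broadcasts one
// alpha per channel, "element" uses one alpha per element, "all" uses a
// single scalar, mirroring the three CUDA kernels above.
std::vector<float> PReluReference(const std::vector<float> &x,
                                  const std::vector<float> &alpha,
                                  const std::string &mode, int batch,
                                  int channel, int spatial) {
  std::vector<float> out(x.size());
  for (int n = 0; n < batch; ++n) {
    for (int c = 0; c < channel; ++c) {
      for (int s = 0; s < spatial; ++s) {
        size_t i = (static_cast<size_t>(n) * channel + c) * spatial + s;
        float scale = (mode == "channel")
                          ? alpha[c]
                          : (mode == "element") ? alpha[i] : alpha[0];
        out[i] = x[i] > 0 ? x[i] : scale * x[i];
      }
    }
  }
  return out;
}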
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class PReluPlugin : public PluginTensorRT {
TensorRTEngine::Weight alpha_;
std::string mode_;
protected:
size_t getSerializationSize() override {
// return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
return 0;
}
// TensorRT calls this function when it needs to serialize the plugin
// configuration. It should not be called directly by users.
void serialize(void *buffer) override {
// serializeBase(buffer);
// SerializeValue(&buffer, alpha_);
// SerializeValue(&buffer, mode_);
}
public:
PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode)
: alpha_(alpha), mode_(mode) {}
// Used by TensorRT when deserializing a serialized engine.
// It should not be called directly by users.
PReluPlugin(void const *serialData, size_t serialLength) {
// deserializeBase(serialData, serialLength);
// DeserializeValue(&serialData, &serialLength, &alpha_);
// DeserializeValue(&serialData, &serialLength, &mode_);
}
PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); }
const char *getPluginType() const override { return "prelu"; }
int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
int nbInputDims) override;
int enqueue(int batchSize, const void *const *inputs, void **outputs,
void *workspace, cudaStream_t stream) override;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -45,11 +45,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 ...@@ -45,11 +45,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
# DAM # DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_test(test_analyzer_dam SRCS analyzer_dam_tester.cc inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS
--infer_model=${DAM_INSTALL_DIR}/model
--infer_data=${DAM_INSTALL_DIR}/data.txt
--use_analysis=0)
# chinese_ner # chinese_ner
set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
...@@ -82,6 +78,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te ...@@ -82,6 +78,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
"${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
# mobilenet with depthwise_conv op
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
# anakin # anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# anakin rnn1 # anakin rnn1
...@@ -108,8 +108,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -108,8 +108,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
endif() endif()
inference_analysis_test(test_trt_models SRCS trt_models_tester.cc inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor
ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
endif() endif()
...@@ -69,7 +69,7 @@ struct DataRecord { ...@@ -69,7 +69,7 @@ struct DataRecord {
num_lines++; num_lines++;
std::vector<std::string> data; std::vector<std::string> data;
split(line, ',', &data); split(line, ',', &data);
CHECK_EQ(data.size(), 2 * MAX_TURN_NUM + 3); CHECK_EQ(data.size(), (size_t)(2 * MAX_TURN_NUM + 3));
// load turn data // load turn data
std::vector<int64_t> turns_tmp[MAX_TURN_NUM]; std::vector<int64_t> turns_tmp[MAX_TURN_NUM];
for (int i = 0; i < MAX_TURN_NUM; ++i) { for (int i = 0; i < MAX_TURN_NUM; ++i) {
...@@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) { ...@@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
PADDLE_ENFORCE_GT(outputs.size(), 0); PADDLE_ENFORCE_GT(outputs.size(), 0);
...@@ -196,15 +197,13 @@ TEST(Analyzer_dam, fuse_statis) { ...@@ -196,15 +197,13 @@ TEST(Analyzer_dam, fuse_statis) {
contrib::AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
if (FLAGS_use_analysis) { int num_ops;
int num_ops; auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg); auto fuse_statis = GetFuseStatis(
auto fuse_statis = GetFuseStatis( static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 317);
EXPECT_EQ(fuse_statis.at("fc_fuse"), 317); EXPECT_EQ(num_ops, 2020);
EXPECT_EQ(num_ops, 2020);
}
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
...@@ -215,9 +214,8 @@ TEST(Analyzer_dam, compare) { ...@@ -215,9 +214,8 @@ TEST(Analyzer_dam, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
if (FLAGS_use_analysis) { CompareNativeAndAnalysis(
CompareNativeAndAnalysis(cfg, input_slots_all); reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
} }
} // namespace inference } // namespace inference
......
...@@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) { ...@@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result // the first inference result
...@@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) { ...@@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace analysis } // namespace analysis
......
...@@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) { ...@@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result // the first inference result
...@@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) { ...@@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace inference } // namespace inference
......
...@@ -45,7 +45,8 @@ void profile(bool use_mkldnn = false) { ...@@ -45,7 +45,8 @@ void profile(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
} }
TEST(Analyzer_resnet50, profile) { profile(); } TEST(Analyzer_resnet50, profile) { profile(); }
...@@ -74,7 +75,8 @@ void compare(bool use_mkldnn = false) { ...@@ -74,7 +75,8 @@ void compare(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
TEST(Analyzer_resnet50, compare) { compare(); } TEST(Analyzer_resnet50, compare) { compare(); }
......
...@@ -233,8 +233,8 @@ TEST(Analyzer_rnn1, profile) { ...@@ -233,8 +233,8 @@ TEST(Analyzer_rnn1, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
LOG(INFO) << "to test prediction"; TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); input_slots_all, &outputs, FLAGS_num_threads);
} }
// Check the fuse status // Check the fuse status
...@@ -261,7 +261,8 @@ TEST(Analyzer_rnn1, compare) { ...@@ -261,7 +261,8 @@ TEST(Analyzer_rnn1, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
// Test Multi-Thread. // Test Multi-Thread.
...@@ -272,7 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) { ...@@ -272,7 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, 4 /* multi_thread */); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, 4 /* multi_thread */);
} }
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
......
...@@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) { ...@@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result // the first inference result
...@@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) { ...@@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace inference } // namespace inference
......
...@@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) { ...@@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result // the first inference result
...@@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) { ...@@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace inference } // namespace inference
......
...@@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) { ...@@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1) { if (FLAGS_num_threads == 1) {
// Get output // Get output
...@@ -101,7 +102,8 @@ TEST(Analyzer_Text_Classification, compare) { ...@@ -101,7 +102,8 @@ TEST(Analyzer_Text_Classification, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
...@@ -112,7 +114,8 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { ...@@ -112,7 +114,8 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace inference } // namespace inference
......
...@@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) { ...@@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->specify_input_name = true; cfg->specify_input_name = true;
// TODO(TJ): fix fusion gru // TODO(TJ): fix fusion gru
cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
#ifdef PADDLE_WITH_MKLDNN
cfg->EnableMKLDNN();
#endif
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
...@@ -94,7 +91,8 @@ void profile(bool use_mkldnn = false) { ...@@ -94,7 +91,8 @@ void profile(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
const float ocr_result_data[] = { const float ocr_result_data[] = {
...@@ -136,7 +134,8 @@ void compare(bool use_mkldnn = false) { ...@@ -136,7 +134,8 @@ void compare(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
TEST(Analyzer_vis, compare) { compare(); } TEST(Analyzer_vis, compare) { compare(); }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <ostream>
#include <sstream>
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
namespace inference {
thread_local int num_spaces = 0;
static std::string GenSpaces(int num_spaces) {
std::ostringstream os;
for (int i = 0; i < num_spaces; ++i) {
os << " ";
}
return os.str();
}
std::ostream &operator<<(std::ostream &os,
const PaddlePredictor::Config &config) {
os << GenSpaces(num_spaces) << "PaddlePredictor::Config {\n";
num_spaces++;
os << GenSpaces(num_spaces) << "model_dir: " << config.model_dir << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;
}
std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
os << GenSpaces(num_spaces) << "NativeConfig {\n";
num_spaces++;
os << *reinterpret_cast<const PaddlePredictor::Config *>(&config);
os << GenSpaces(num_spaces) << "use_gpu: " << config.use_gpu << "\n";
os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
os << GenSpaces(num_spaces)
<< "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
os << GenSpaces(num_spaces)
<< "specify_input_name: " << config.specify_input_name << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;
}
std::ostream &operator<<(std::ostream &os,
const contrib::AnalysisConfig &config) {
os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
num_spaces++;
os << *reinterpret_cast<const NativeConfig *>(&config);
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
<< "\n";
os << GenSpaces(num_spaces)
<< "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n";
os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt()
<< "\n";
os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;
}
} // namespace inference
} // namespace paddle
...@@ -19,13 +19,16 @@ ...@@ -19,13 +19,16 @@
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/tests/api/config_printer.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -38,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); ...@@ -38,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
DEFINE_bool(use_analysis, true, DEFINE_bool(use_analysis, true,
"Running the inference program in analysis mode."); "Running the inference program in analysis mode.");
DECLARE_bool(profile);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
using contrib::AnalysisConfig; void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
if (use_analysis) {
LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
return;
}
LOG(INFO) << *config;
}
void CompareResult(const std::vector<PaddleTensor> &outputs, void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &ref_outputs) { const std::vector<PaddleTensor> &ref_outputs) {
...@@ -77,12 +88,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -77,12 +88,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
} }
std::unique_ptr<PaddlePredictor> CreateTestPredictor( std::unique_ptr<PaddlePredictor> CreateTestPredictor(
const AnalysisConfig &config, bool use_analysis = true) { const PaddlePredictor::Config *config, bool use_analysis = true) {
if (use_analysis) { if (use_analysis) {
return CreatePaddlePredictor<contrib::AnalysisConfig>(config); return CreatePaddlePredictor<contrib::AnalysisConfig>(
} else { *(reinterpret_cast<const contrib::AnalysisConfig *>(config)));
return CreatePaddlePredictor<NativeConfig>(config);
} }
return CreatePaddlePredictor<NativeConfig>(
*(reinterpret_cast<const NativeConfig *>(config)));
} }
size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
...@@ -111,11 +123,23 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor, ...@@ -111,11 +123,23 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
} }
void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs, void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::string &dirname) { const std::string &dirname, bool is_combined = true,
std::string model_filename = "model",
std::string params_filename = "params") {
// Set fake_image_data // Set fake_image_data
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
std::vector<std::vector<int64_t>> feed_target_shapes = std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
GetFeedTargetShapes(dirname, true, "model", "params"); dirname, is_combined, model_filename, params_filename);
std::ostringstream os;
for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
os << "feed target " << i << ": {" << feed_target_shapes[i][0];
for (size_t j = 1; j < feed_target_shapes[i].size(); ++j) {
os << ", " << feed_target_shapes[i][j];
}
os << "}\n";
}
LOG(INFO) << os.str();
int dim1 = feed_target_shapes[0][1]; int dim1 = feed_target_shapes[0][1];
int dim2 = feed_target_shapes[0][2]; int dim2 = feed_target_shapes[0][2];
int dim3 = feed_target_shapes[0][3]; int dim3 = feed_target_shapes[0][3];
...@@ -139,25 +163,43 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs, ...@@ -139,25 +163,43 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
} }
void TestOneThreadPrediction( void TestOneThreadPrediction(
const AnalysisConfig &config, const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, bool use_analysis = true) { std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
int batch_size = FLAGS_batch_size; int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat; int num_times = FLAGS_repeat;
auto predictor = CreateTestPredictor(config, use_analysis); auto predictor = CreateTestPredictor(config, use_analysis);
Timer timer;
timer.tic(); // warmup run
for (int i = 0; i < num_times; i++) { LOG(INFO) << "Warm up run...";
for (size_t j = 0; j < inputs.size(); j++) { {
predictor->Run(inputs[j], outputs); Timer warmup_timer;
warmup_timer.tic();
predictor->Run(inputs[0], outputs, batch_size);
PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
#if !defined(_WIN32)
if (FLAGS_profile) {
paddle::platform::ResetProfiler();
}
#endif
}
LOG(INFO) << "Run " << num_times << " times...";
{
Timer run_timer;
run_timer.tic();
for (int i = 0; i < num_times; i++) {
for (size_t j = 0; j < inputs.size(); j++) {
predictor->Run(inputs[j], outputs, batch_size);
}
} }
PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times,
inputs.size());
} }
PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
inputs.size());
} }
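Splitting a warm-up run from the timed loop above keeps one-time costs (predictor construction side effects, lazy allocations, and the optional profiler reset) out of the reported numbers; the latency passed to PrintTime is run_timer.toc() / num_times, i.e. the average wall time of one pass over all input batches.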
void TestMultiThreadPrediction( void TestMultiThreadPrediction(
const AnalysisConfig &config, const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<PaddleTensor> *outputs, int num_threads,
bool use_analysis = true) { bool use_analysis = true) {
...@@ -200,12 +242,11 @@ void TestMultiThreadPrediction( ...@@ -200,12 +242,11 @@ void TestMultiThreadPrediction(
} }
} }
void TestPrediction(const AnalysisConfig &config, void TestPrediction(const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<PaddleTensor> *outputs, int num_threads,
bool use_analysis = FLAGS_use_analysis) { bool use_analysis = FLAGS_use_analysis) {
LOG(INFO) << "use_analysis: " << use_analysis PrintConfig(config, use_analysis);
<< ", use_mkldnn: " << config.use_mkldnn();
if (num_threads == 1) { if (num_threads == 1) {
TestOneThreadPrediction(config, inputs, outputs, use_analysis); TestOneThreadPrediction(config, inputs, outputs, use_analysis);
} else { } else {
...@@ -215,9 +256,9 @@ void TestPrediction(const AnalysisConfig &config, ...@@ -215,9 +256,9 @@ void TestPrediction(const AnalysisConfig &config,
} }
void CompareNativeAndAnalysis( void CompareNativeAndAnalysis(
const AnalysisConfig &config, const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
LOG(INFO) << "use_mkldnn: " << config.use_mkldnn(); PrintConfig(config, true);
std::vector<PaddleTensor> native_outputs, analysis_outputs; std::vector<PaddleTensor> native_outputs, analysis_outputs;
TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &native_outputs, false);
TestOneThreadPrediction(config, inputs, &analysis_outputs, true); TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
// You may obtain a copy of the License at You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and See the License for the specific language governing permissions and
// limitations under the License. limitations under the License. */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle { namespace paddle {
using paddle::contrib::AnalysisConfig; namespace inference {
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine.");
DEFINE_string(prog_filename, "", "Name of model file.");
NativeConfig GetConfigNative() { DEFINE_string(param_filename, "", "Name of parameters file.");
NativeConfig config;
config.model_dir = FLAGS_dirname; template <typename ConfigType>
// LOG(INFO) << "dirname " << config.model_dir; void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu,
config.fraction_of_gpu_memory = 0.15; bool use_tensorrt = false, int batch_size = -1) {
config.use_gpu = true; if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
config.device = 0; config->prog_file = model_dir + "/" + FLAGS_prog_filename;
return config; config->param_file = model_dir + "/" + FLAGS_param_filename;
} } else {
config->model_dir = model_dir;
void PrepareTRTConfig(AnalysisConfig *config) { }
config->model_dir = FLAGS_dirname + "/" + "mobilenet"; if (use_gpu) {
config->fraction_of_gpu_memory = 0.15; config->use_gpu = true;
config->EnableTensorRtEngine(1 << 10, 5); config->device = 0;
config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->fraction_of_gpu_memory = 0.15;
config->pass_builder()->DeletePass("fc_fuse_pass"); }
config->pass_builder()->TurnOnDebug();
} }
void PrepareInputs(std::vector<PaddleTensor> *tensors, int batch_size) { template <>
PADDLE_ENFORCE_EQ(tensors->size(), 1UL); void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config,
auto &tensor = tensors->front(); std::string model_dir, bool use_gpu,
int height = 224; bool use_tensorrt, int batch_size) {
int width = 224; if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
float *data = new float[batch_size * 3 * height * width]; config->prog_file = model_dir + "/" + FLAGS_prog_filename;
memset(data, 0, sizeof(float) * (batch_size * 3 * height * width)); config->param_file = model_dir + "/" + FLAGS_param_filename;
data[0] = 1.0f; } else {
config->model_dir = model_dir;
// Prepare inputs }
tensor.name = "input_0"; if (use_gpu) {
tensor.shape = std::vector<int>({batch_size, 3, height, width}); config->use_gpu = true;
tensor.data = PaddleBuf(static_cast<void *>(data), config->device = 0;
sizeof(float) * (batch_size * 3 * height * width)); config->fraction_of_gpu_memory = 0.15;
tensor.dtype = PaddleDType::FLOAT32; if (use_tensorrt) {
config->EnableTensorRtEngine(1 << 10, batch_size);
config->pass_builder()->DeletePass("conv_bn_fuse_pass");
config->pass_builder()->DeletePass("fc_fuse_pass");
config->pass_builder()->TurnOnDebug();
} else {
config->enable_ir_optim = true;
}
}
} }
void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) { void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
auto config0 = GetConfigNative(); std::vector<std::vector<PaddleTensor>> inputs_all;
config0.model_dir = model_dirname; if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
AnalysisConfig config1(true); FLAGS_param_filename);
PrepareTRTConfig(&config1); } else {
config1.model_dir = model_dirname; SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
auto predictor1 = CreatePaddlePredictor(config1);
// Prepare inputs
std::vector<PaddleTensor> paddle_tensor_feeds(1);
PrepareInputs(&paddle_tensor_feeds, batch_size);
// Prepare outputs
std::vector<PaddleTensor> outputs0;
std::vector<PaddleTensor> outputs1;
CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
const size_t num_elements = outputs0.front().data.length() / sizeof(float);
const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
EXPECT_EQ(num_elements, num_elements1);
auto *data0 = static_cast<float *>(outputs0.front().data.data());
auto *data1 = static_cast<float *>(outputs1.front().data.data());
ASSERT_GT(num_elements, 0UL);
for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
EXPECT_NEAR(data0[i], data1[i], 1e-3);
} }
}
TEST(trt_models_test, mobilenet) { std::vector<PaddleTensor> outputs;
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "mobilenet"); if (use_analysis || use_tensorrt) {
} contrib::AnalysisConfig config(true);
TEST(trt_models_test, resnet50) { SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt,
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnet50"); FLAGS_batch_size);
} TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
TEST(trt_models_test, resnext50) { inputs_all, &outputs, FLAGS_num_threads, true);
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnext50"); } else {
NativeConfig config;
SetConfig<NativeConfig>(&config, model_dir, true, false);
TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
inputs_all, &outputs, FLAGS_num_threads, false);
}
} }
TEST(trt_models_test, raw_gpu) { void compare(std::string model_dir, bool use_tensorrt) {
std::string model_dir = FLAGS_dirname + "/" + "mobilenet"; std::vector<std::vector<PaddleTensor>> inputs_all;
auto config0 = GetConfigNative(); if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
config0.model_dir = model_dir; SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
int batch_size = 2; FLAGS_param_filename);
} else {
AnalysisConfig config1(true); SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
config1.fraction_of_gpu_memory = 0.1; }
config1.enable_ir_optim = true;
config1.model_dir = model_dir;
auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0); std::vector<PaddleTensor> native_outputs;
auto predictor1 = CreatePaddlePredictor(config1); NativeConfig native_config;
SetConfig<NativeConfig>(&native_config, model_dir, true, false,
FLAGS_batch_size);
TestOneThreadPrediction(
reinterpret_cast<PaddlePredictor::Config*>(&native_config), inputs_all,
&native_outputs, false);
std::vector<PaddleTensor> analysis_outputs;
contrib::AnalysisConfig analysis_config(true);
SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
use_tensorrt, FLAGS_batch_size);
TestOneThreadPrediction(
reinterpret_cast<PaddlePredictor::Config*>(&analysis_config), inputs_all,
&analysis_outputs, true);
CompareResult(native_outputs, analysis_outputs);
}
// Prepare inputs TEST(TensorRT_mobilenet, compare) {
std::vector<PaddleTensor> paddle_tensor_feeds(1); std::string model_dir = FLAGS_infer_model + "/mobilenet";
PrepareInputs(&paddle_tensor_feeds, batch_size); compare(model_dir, /* use_tensorrt */ true);
}
// Prepare outputs TEST(TensorRT_resnet50, compare) {
std::vector<PaddleTensor> outputs0; std::string model_dir = FLAGS_infer_model + "/resnet50";
std::vector<PaddleTensor> outputs1; compare(model_dir, /* use_tensorrt */ true);
CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0)); }
CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
const size_t num_elements = outputs0.front().data.length() / sizeof(float); TEST(TensorRT_resnext50, compare) {
const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); std::string model_dir = FLAGS_infer_model + "/resnext50";
EXPECT_EQ(num_elements, num_elements1); compare(model_dir, /* use_tensorrt */ true);
}
auto *data0 = static_cast<float *>(outputs0.front().data.data()); TEST(TensorRT_resnext50, profile) {
auto *data1 = static_cast<float *>(outputs1.front().data.data()); std::string model_dir = FLAGS_infer_model + "/resnext50";
profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
}
ASSERT_GT(num_elements, 0UL); TEST(TensorRT_mobilenet, analysis) {
for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
EXPECT_NEAR(data0[i], data1[i], 1e-3); compare(model_dir, /* use_tensorrt */ false);
}
} }
} // namespace inference
} // namespace paddle } // namespace paddle
USE_PASS(tensorrt_subgraph_pass); USE_PASS(tensorrt_subgraph_pass);
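With FLAGS_infer_model now used as the model root (matching the ARGS change in the CMake rule above), the tester is invoked as, for example, ./test_trt_models --infer_model=/path/to/trt_test_models --use_tensorrt=true, where the path is a placeholder for the downloaded trt_test_models directory.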
...@@ -94,7 +94,8 @@ function(op_library TARGET) ...@@ -94,7 +94,8 @@ function(op_library TARGET)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops. # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op"
"fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}") if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return() return()
endif() endif()
...@@ -299,7 +300,6 @@ if (NOT WIN32) ...@@ -299,7 +300,6 @@ if (NOT WIN32)
op_library(gru_op DEPS sequence2batch gru_compute) op_library(gru_op DEPS sequence2batch gru_compute)
endif(NOT WIN32) endif(NOT WIN32)
op_library(recurrent_op DEPS executor) op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor) op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor) op_library(parallel_do_op DEPS executor)
op_library(unsqueeze_op DEPS reshape_op) op_library(unsqueeze_op DEPS reshape_op)
...@@ -308,8 +308,10 @@ op_library(flatten_op DEPS reshape_op) ...@@ -308,8 +308,10 @@ op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding) op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op) op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory) op_library(fake_quantize_op DEPS memory)
if (NOT WIN32)
op_library(crf_decoding_op DEPS jit_kernel) op_library(crf_decoding_op DEPS jit_kernel)
op_library(fusion_lstm_op DEPS jit_kernel) op_library(fusion_lstm_op DEPS jit_kernel)
endif(NOT WIN32)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub) op_library(layer_norm_op DEPS cub)
...@@ -325,8 +327,16 @@ op_library(save_op DEPS lod_tensor) ...@@ -325,8 +327,16 @@ op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor)
op_library(tensor_array_to_tensor_op DEPS concat_op)
op_library(concat_op DEPS concat_and_split) op_library(concat_op DEPS concat_and_split)
op_library(tensor_array_to_tensor_op DEPS concat_op)
set(DEPS_OPS ${DEPS_OPS} warpctc_op)
if (WITH_GPU)
if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
endif()
endif()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
......
...@@ -71,6 +71,10 @@ class MKLDNNActivationGradKernel ...@@ -71,6 +71,10 @@ class MKLDNNActivationGradKernel
diff_y->format() != memory::format::format_undef, diff_y->format() != memory::format::format_undef,
"Wrong layout/format set for Input OutGrad tensor"); "Wrong layout/format set for Input OutGrad tensor");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
Functor functor; Functor functor;
auto attrs = functor.GetAttrs(); auto attrs = functor.GetAttrs();
...@@ -115,11 +119,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx, ...@@ -115,11 +119,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
const std::string key_fwd = key_with_layout + "@eltwise_fwd"; const std::string key_fwd = key_with_layout + "@eltwise_fwd";
const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd"; const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd";
bool is_test = ctx.Attr<bool>("is_test");
// save input data and layout to be referred in backward path // save input data and layout to be referred in backward path
auto p_src_data = std::make_shared<const T *>(x_data); auto p_src_data = std::make_shared<const T *>(x_data);
dev_ctx.SetBlob(key_src_data, p_src_data);
auto p_src_layout = std::make_shared<memory::format>(src_format); auto p_src_layout = std::make_shared<memory::format>(src_format);
dev_ctx.SetBlob(key_src_layout, p_src_layout); if (!is_test) {
dev_ctx.SetBlob(key_src_data, p_src_data);
dev_ctx.SetBlob(key_src_layout, p_src_layout);
}
auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>( auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
dev_ctx.GetBlob(key_fwd)); dev_ctx.GetBlob(key_fwd));
...@@ -136,14 +144,17 @@ void eltwise_forward(const framework::ExecutionContext &ctx, ...@@ -136,14 +144,17 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
dev_ctx.SetBlob(key_src_mem, src_memory); dev_ctx.SetBlob(key_src_mem, src_memory);
// create primitive descriptor for activation forward and save it // create primitive descriptor for activation forward and save it
auto mkldnn_forward_prop_kind = is_test
? mkldnn::prop_kind::forward_inference
: mkldnn::prop_kind::forward_training;
auto forward_desc = mkldnn::eltwise_forward::desc( auto forward_desc = mkldnn::eltwise_forward::desc(
mkldnn::prop_kind::forward_training, algorithm, mkldnn_forward_prop_kind, algorithm,
src_memory->get_primitive_desc().desc(), alpha, beta); src_memory->get_primitive_desc().desc(), alpha, beta);
auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>( auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
forward_desc, mkldnn_engine); forward_desc, mkldnn_engine);
// save prim desc into global device context to be referred in backward path // save prim desc into global device context to be referred in backward path
dev_ctx.SetBlob(key_fwd_pd, forward_pd); if (!is_test) dev_ctx.SetBlob(key_fwd_pd, forward_pd);
// create mkldnn memory for output y // create mkldnn memory for output y
dst_memory = dst_memory =
......
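The is_test gating above reflects the forward/backward split: the saved source data/layout blobs and the forward primitive descriptor exist only so the backward pass can reuse them, so an inference-only run can skip those SetBlob calls and request mkldnn::prop_kind::forward_inference instead of forward_training.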
...@@ -22,18 +22,23 @@ namespace operators { ...@@ -22,18 +22,23 @@ namespace operators {
using paddle::framework::Tensor; using paddle::framework::Tensor;
#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \
class OP_NAME##OpMaker \ class OP_NAME##OpMaker \
: public ::paddle::framework::OpProtoAndCheckerMaker { \ : public ::paddle::framework::OpProtoAndCheckerMaker { \
public: \ public: \
void Make() override { \ void Make() override { \
AddInput("X", "Input of " #OP_NAME " operator"); \ AddInput("X", "Input of " #OP_NAME " operator"); \
AddOutput("Out", "Output of " #OP_NAME " operator"); \ AddOutput("Out", "Output of " #OP_NAME " operator"); \
AddAttr<bool>("use_mkldnn", \ AddAttr<bool>("use_mkldnn", \
"(bool, default false) Only used in mkldnn kernel") \ "(bool, default false) Only used in mkldnn kernel") \
.SetDefault(false); \ .SetDefault(false); \
AddComment(#OP_COMMENT); \ AddAttr<bool>( \
} \ "is_test", \
"(bool, default false) Set to true for inference only, false " \
"for training. Some layers may run faster when this is true.") \
.SetDefault(false); \
AddComment(#OP_COMMENT); \
} \
} }
#define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \ #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \
...@@ -269,7 +274,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -269,7 +274,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
:strong:`Softshrink Activation Operator` :strong:`Softshrink Activation Operator`
.. math:: .. math::
out = \begin{cases} out = \begin{cases}
x - \lambda, \text{if } x > \lambda \\ x - \lambda, \text{if } x > \lambda \\
x + \lambda, \text{if } x < -\lambda \\ x + \lambda, \text{if } x < -\lambda \\
0, \text{otherwise} 0, \text{otherwise}
...@@ -435,7 +440,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -435,7 +440,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
HardSigmoid Activation Operator. HardSigmoid Activation Operator.
Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391),
which is much faster than sigmoid. which is much faster than sigmoid.
$out = \max(0, \min(1, slope * x + shift))$ $out = \max(0, \min(1, slope * x + shift))$
......
...@@ -113,7 +113,10 @@ class BatchNormOp : public framework::OperatorWithKernel { ...@@ -113,7 +113,10 @@ class BatchNormOp : public framework::OperatorWithKernel {
class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<float>("momentum", "").SetDefault(0.9); AddAttr<float>("momentum", "").SetDefault(0.9);
AddAttr<float>("epsilon", "") AddAttr<float>("epsilon", "")
.SetDefault(1e-5) .SetDefault(1e-5)
......
...@@ -50,12 +50,18 @@ static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache"; ...@@ -50,12 +50,18 @@ static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024; static_cast<size_t>(1024) * 1024 * 1024;
static constexpr size_t kNUM_CUDNN_FWD_ALGS = #if CUDNN_VERSION_MIN(6, 0, 5)
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
#else
// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc.
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
#endif
template <typename T> template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> { class CUDNNConvOpKernel : public framework::OpKernel<T> {
......
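The guarded constants above exist because the *_ALGO_COUNT enumerators were only added in newer cuDNN releases; on cuDNN v5 the counts are hard-coded. As a hedged sketch (not part of this patch, and assuming the handle and descriptors are created and configured elsewhere), such a count is typically used to size the perf-result buffer handed to cuDNN's exhaustive algorithm search:

#include <cudnn.h>
#include <vector>

// Benchmark all forward algorithms and return them sorted by runtime.
// algo_count would be kNUM_CUDNN_FWD_ALGS in the kernel above.
std::vector<cudnnConvolutionFwdAlgoPerf_t> FindFwdAlgos(
    cudnnHandle_t handle, cudnnTensorDescriptor_t x_desc,
    cudnnFilterDescriptor_t w_desc, cudnnConvolutionDescriptor_t conv_desc,
    cudnnTensorDescriptor_t y_desc, int algo_count) {
  std::vector<cudnnConvolutionFwdAlgoPerf_t> perf(algo_count);
  int returned = 0;
  cudnnFindConvolutionForwardAlgorithm(handle, x_desc, w_desc, conv_desc,
                                       y_desc, algo_count, &returned,
                                       perf.data());
  perf.resize(returned);
  return perf;
}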
...@@ -383,20 +383,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -383,20 +383,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// create a conv primitive descriptor and save it for usage in backward // create a conv primitive descriptor and save it for usage in backward
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd; std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
: mkldnn::prop_kind::forward_training;
if (bias) { if (bias) {
bias_tz = paddle::framework::vectorize2int(bias->dims()); bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc( auto bias_md = platform::MKLDNNMemDesc(
bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x); bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, conv_pd = ConvFwdPrimitiveDesc(
strides, paddings, mkldnn_engine, src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
fuse_relu, fuse_residual_conn); fuse_relu, fuse_residual_conn, fwd_prop_kind);
} else { } else {
conv_pd = conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, paddings, mkldnn_engine, fuse_relu,
mkldnn_engine, fuse_relu, fuse_residual_conn); fuse_residual_conn, fwd_prop_kind);
} }
// Save conv_pd/src_memory/weights_memory for backward pass // Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd); if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd);
ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
...@@ -510,14 +512,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -510,14 +512,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const memory::desc& dst, const std::vector<int>& strides, const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn) const { const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc( auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst,
dst, stride_dims, padding_dims, padding_dims, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, fuse_residual_conn); CreatePostOps(fuse_relu, fuse_residual_conn);
...@@ -535,14 +537,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -535,14 +537,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn) const { const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc( auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst,
bias, dst, stride_dims, padding_dims, padding_dims, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, fuse_residual_conn); CreatePostOps(fuse_relu, fuse_residual_conn);
...@@ -587,6 +589,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -587,6 +589,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
output_grad->format() != memory::format::format_undef, output_grad->format() != memory::format::format_undef,
"Wrong layout/format set for output_grad tensor"); "Wrong layout/format set for output_grad tensor");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
if (!input_grad && !filter_grad) return; if (!input_grad && !filter_grad) return;
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides"); std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
......
...@@ -109,7 +109,10 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -109,7 +109,10 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
} }
void Conv2DOpMaker::Make() { void Conv2DOpMaker::Make() {
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddInput( AddInput(
"Input", "Input",
"(Tensor) The input tensor of convolution operator. " "(Tensor) The input tensor of convolution operator. "
......
...@@ -15,6 +15,10 @@ limitations under the License. */ ...@@ -15,6 +15,10 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
using paddle::platform::float16;
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -49,7 +49,10 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -49,7 +49,10 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f, PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f,
"'dropout_prob' must be between 0.0 and 1.0."); "'dropout_prob' must be between 0.0 and 1.0.");
}); });
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<bool>("fix_seed", AddAttr<bool>("fix_seed",
"A flag indicating whether to use a fixed seed to generate " "A flag indicating whether to use a fixed seed to generate "
"random mask. NOTE: DO NOT set this flag to true in " "random mask. NOTE: DO NOT set this flag to true in "
......
...@@ -111,6 +111,17 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext> ...@@ -111,6 +111,17 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext>
return *this; return *this;
} }
RowwiseTransformIterator<T, platform::CPUDeviceContext> &operator+(int n) {
while (n-- > 0) {
++i_;
if (UNLIKELY(i_ == n_)) {
i_ = 0;
}
}
return *this;
}
bool operator==(const RowwiseTransformIterator<T, platform::CPUDeviceContext> bool operator==(const RowwiseTransformIterator<T, platform::CPUDeviceContext>
&rhs) const { &rhs) const {
return (ptr_ + i_) == &(*rhs); return (ptr_ + i_) == &(*rhs);
...@@ -149,6 +160,21 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext> ...@@ -149,6 +160,21 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext>
return *this; return *this;
} }
MidWiseTransformIterator<T, platform::CPUDeviceContext> &operator+(int n) {
while (n-- > 0) {
++j_;
if (UNLIKELY(j_ == post_)) {
++i_;
j_ = 0;
if (UNLIKELY(i_ == n_)) {
i_ = 0;
}
}
}
return *this;
}
bool operator==(const MidWiseTransformIterator<T, platform::CPUDeviceContext> bool operator==(const MidWiseTransformIterator<T, platform::CPUDeviceContext>
&rhs) const { &rhs) const {
return (ptr_ + i_) == &(*rhs); return (ptr_ + i_) == &(*rhs);
......
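The new operator+(int n) on these iterators simply repeats the increment logic n times; the point is the wrap-around that lets the small tensor be re-read while the large tensor is walked linearly. A standalone sketch (illustration only, not the Paddle class) of the row-wise case:

#include <cstdio>

int main() {
  const int n = 3;  // width of the broadcast (small) tensor
  int i = 0;        // current position inside the small tensor
  for (int step = 0; step < 8; ++step) {  // advance 8 elements of the big tensor
    std::printf("big index %d -> small index %d\n", step, i);
    ++i;
    if (i == n) i = 0;  // wrap, exactly like RowwiseTransformIterator
  }
  return 0;
}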
...@@ -47,6 +47,11 @@ class ExpandOp : public framework::OperatorWithKernel { ...@@ -47,6 +47,11 @@ class ExpandOp : public framework::OperatorWithKernel {
out_shape[i] = x_dims[i] * expand_times[i]; out_shape[i] = x_dims[i] * expand_times[i];
} }
// set the first dim to -1 in compile time
if (!ctx->IsRuntime()) {
out_shape[0] = x_dims[0];
}
ctx->SetOutputDim("Out", framework::make_ddim(out_shape)); ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
if (out_shape[0] == x_dims[0]) { if (out_shape[0] == x_dims[0]) {
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
...@@ -109,7 +114,16 @@ class ExpandGradOp : public framework::OperatorWithKernel { ...@@ -109,7 +114,16 @@ class ExpandGradOp : public framework::OperatorWithKernel {
ctx->Attrs().Get<std::vector<int>>("expand_times"); ctx->Attrs().Get<std::vector<int>>("expand_times");
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
for (size_t i = 0; i < expand_times.size(); ++i) { size_t start_pos = 0u;
if (!ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(
x_dims[0], out_dims[0],
"The first dimension size of Input(Out@GRAD) should be "
"equal to the crroresponding dimension size of Input(X)");
start_pos = 1u;
}
for (size_t i = start_pos; i < expand_times.size(); ++i) {
PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i], PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
"Each dimension size of Input(Out@GRAD) should be " "Each dimension size of Input(Out@GRAD) should be "
"equal to multiplication of crroresponding dimension " "equal to multiplication of crroresponding dimension "
......
...@@ -138,7 +138,7 @@ class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -138,7 +138,7 @@ class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
FakeQuantize operator FakeQuantize operator
$$scale = max(abs(X))$$ $$scale = max(abs(X))$$
$$range = 2^{bit_length - 1} - 1$$ $$range = 2^{bit_length - 1} - 1$$
$$Out = round(X/scale * range)$$ $$Out = round(X/scale * range)$$
...@@ -199,11 +199,14 @@ class FakeQuantizeRangeAbsMaxOpMaker ...@@ -199,11 +199,14 @@ class FakeQuantizeRangeAbsMaxOpMaker
PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
"'bit_length' should be between 1 and 16."); "'bit_length' should be between 1 and 16.");
}); });
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
FakeQuantize operator is used in static quantization. FakeQuantize operator is used in static quantization.
$$scale = max(max(abs(x)), history_abs_max)$$ $$scale = max(max(abs(x)), history_abs_max)$$
$$range = 2^{bit_length - 1} - 1$$ $$range = 2^{bit_length - 1} - 1$$
$$Out = round(X/scale * range)$$ $$Out = round(X/scale * range)$$
......
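A small worked example (illustrative only) of the formulas in the comment above, for bit_length = 8: range = 2^7 - 1 = 127, scale is the absolute maximum of the input, and each value is rounded after rescaling:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const int bit_length = 8;
  const float range = std::pow(2.f, bit_length - 1) - 1;  // 127
  const float x[3] = {0.5f, -1.25f, 2.0f};
  float scale = 0.f;
  for (float v : x) scale = std::max(scale, std::fabs(v));  // scale = 2.0
  for (float v : x) {
    // e.g. 0.5 -> round(0.5 / 2.0 * 127) = 32
    std::printf("%g -> %g\n", v, std::round(v / scale * range));
  }
  return 0;
}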
...@@ -27,11 +27,9 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -27,11 +27,9 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
"Out(Output) of Fully Connected should not be null."); "Out(Output) of Fully Connected should not be null.");
PADDLE_ENFORCE(ctx->HasInput("W"), PADDLE_ENFORCE(ctx->HasInput("W"),
"W(Input) of Fully Connected should not be null."); "W(Input) of Fully Connected should not be null.");
// NCHW
auto in_dims = ctx->GetInputDim("Input"); auto in_dims = ctx->GetInputDim("Input");
// IO, I=C*H*W
auto w_dims = ctx->GetInputDim("W"); auto w_dims = ctx->GetInputDim("W");
std::vector<int64_t> output_shape({in_dims[0], w_dims[1]});
if (ctx->HasInput("Bias")) { if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias"); auto bias_dims = ctx->GetInputDim("Bias");
...@@ -44,14 +42,32 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -44,14 +42,32 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
"The shape of Bias must be [1, dim]."); "The shape of Bias must be [1, dim].");
} }
} }
PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
"Fully Connected input should be 2-D or 4-D tensor."); if (ctx->Attrs().Get<bool>("use_mkldnn")) {
PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
"Fully Connected input should be 2-D or 4-D tensor.");
}
PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, PADDLE_ENFORCE_EQ(w_dims.size(), 2UL,
"Fully Connected input should be 2-D tensor."); "Fully Connected input should be 2-D tensor.");
PADDLE_ENFORCE_EQ(framework::product(in_dims) / in_dims[0], w_dims[0], int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims");
"Fully Connected input and weigth size do not match."); PADDLE_ENFORCE_GT(
in_dims.size(), in_num_col_dims,
"The input tensor Input's rank of FCOp should be larger than "
"in_num_col_dims.");
auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
PADDLE_ENFORCE_EQ(
in_mat_dims[1], w_dims[0],
"Fully Connected input and weigth size do not match. %s, %s");
std::vector<int64_t> output_dims;
output_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
for (int i = 0; i < in_num_col_dims; ++i) {
output_dims.push_back(in_dims[i]);
}
output_dims.push_back(w_dims[1]);
ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
ctx->ShareLoD("Input", "Out"); ctx->ShareLoD("Input", "Out");
} }
...@@ -101,12 +117,15 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType( ...@@ -101,12 +117,15 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType(
} }
void FCOpMaker::Make() { void FCOpMaker::Make() {
AddInput("Input", AddInput("Input", "(Tensor), The input tensor of fully connected operator.");
"(Tensor), The input tensor of fully connected operator with format "
"(NCHW). ");
AddInput("W", "(Tensor), The weight fc op with shape (I, O)."); AddInput("W", "(Tensor), The weight fc op with shape (I, O).");
AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O") AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O")
.AsDispensable(); .AsDispensable();
AddAttr<int>("in_num_col_dims",
"(int, default 1), The fc op can take tensors with more than "
"two dimensions as its inputs.")
.SetDefault(1)
.EqualGreaterThan(1);
AddOutput("Out", "(Tensor) The output tensor of fully connected operator. "); AddOutput("Out", "(Tensor) The output tensor of fully connected operator. ");
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
...@@ -131,13 +150,15 @@ class FCOpKernel : public framework::OpKernel<T> { ...@@ -131,13 +150,15 @@ class FCOpKernel : public framework::OpKernel<T> {
auto output = ctx.Output<Tensor>("Out"); auto output = ctx.Output<Tensor>("Out");
auto in_dims = input->dims(); auto in_dims = input->dims();
auto w_dims = w->dims(); auto w_dims = w->dims();
auto out_dims = output->dims();
int M = framework::product(out_dims) / out_dims[out_dims.size() - 1];
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
const T* w_data = w->data<T>(); const T* w_data = w->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace()); T* output_data = output->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx); auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
math::FCCompute<platform::CPUDeviceContext, T>( math::FCCompute<platform::CPUDeviceContext, T>(
blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data, blas, M, w_dims[1], w_dims[0], input_data, w_data, output_data,
bias ? bias->data<T>() : NULL); bias ? bias->data<T>() : NULL);
// TODO(TJ): fuse act // TODO(TJ): fuse act
......
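The reshaping implied by the new in_num_col_dims attribute: the leading in_num_col_dims dimensions of Input are kept, everything after them is flattened into the feature dimension that has to equal W's row count, and Out is those leading dimensions plus W's column count. A small sketch of the shape arithmetic (illustration only, plain C++):

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::vector<long> in_dims = {2, 3, 4, 5};  // Input
  const int in_num_col_dims = 2;
  const long w_rows = 4 * 5, w_cols = 7;     // W is [I, O] = [20, 7]
  long feature = std::accumulate(in_dims.begin() + in_num_col_dims,
                                 in_dims.end(), 1L, std::multiplies<long>());
  std::printf("flattened feature size = %ld (must equal W rows = %ld)\n",
              feature, w_rows);
  // Out keeps the leading dims and appends W's columns: [2, 3, 7].
  std::printf("Out dims = [%ld, %ld, %ld]\n", in_dims[0], in_dims[1], w_cols);
  return 0;
}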
...@@ -63,7 +63,8 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx, ...@@ -63,7 +63,8 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
Tensor ones; Tensor ones;
ones.mutable_data<T>({n, h, w}, ctx.GetPlace()); ones.mutable_data<T>({n, h, w}, ctx.GetPlace());
auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0); auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0);
Tensor half_xmax, half_ymax; Tensor half_xmax;
Tensor half_ymax;
half_xmax.mutable_data<T>({n, h, w}, ctx.GetPlace()); half_xmax.mutable_data<T>({n, h, w}, ctx.GetPlace());
auto half_xmax_t = auto half_xmax_t =
EigenTensor<T, 3>::From(half_xmax).setConstant(0.5 * x_max); EigenTensor<T, 3>::From(half_xmax).setConstant(0.5 * x_max);
......
...@@ -38,7 +38,7 @@ class HashOp : public framework::OperatorWithKernel { ...@@ -38,7 +38,7 @@ class HashOp : public framework::OperatorWithKernel {
std::vector<int64_t> out_dims; std::vector<int64_t> out_dims;
out_dims.reserve(dims.size() + 1); out_dims.reserve(dims.size() + 1);
// copy all dims except the last one // copy all dims except the last one
for (size_t i = 0u; i != dims.size() - 1; ++i) { for (int i = 0u; i != dims.size() - 1; ++i) {
out_dims.emplace_back(dims[i]); out_dims.emplace_back(dims[i]);
} }
int num_hash = ctx->Attrs().Get<int>("num_hash"); int num_hash = ctx->Attrs().Get<int>("num_hash");
......
...@@ -46,7 +46,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> { ...@@ -46,7 +46,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> {
int pre_pad = (n - 1) / 2; int pre_pad = (n - 1) / 2;
// compute batches one by one // compute batches one by one
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
blas.VSQR(fea_size, idata + i * fea_size, sdata + pre_pad * img_size); blas.VSQUARE(fea_size, idata + i * fea_size, sdata + pre_pad * img_size);
// init the first channel of mid // init the first channel of mid
for (int c = 0; c < n; ++c) { for (int c = 0; c < n; ++c) {
blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i * fea_size); blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i * fea_size);
...@@ -229,8 +229,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -229,8 +229,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
"the input will be transformed automatically. ") "the input will be transformed automatically. ")
.SetDefault("AnyLayout"); .SetDefault("AnyLayout");
AddAttr<bool>("is_test", AddAttr<bool>("is_test",
"Turns on memory optimization that optimizes away " "(bool, default false) Set to true for inference only, false "
"unnecessary memory allocations. Used by MKLDNN.") "for training. Some layers may run faster when this is true.")
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
......
...@@ -75,12 +75,13 @@ if(WITH_GPU) ...@@ -75,12 +75,13 @@ if(WITH_GPU)
endif() endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
if (NOT WIN32)
set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
if(WITH_XBYAK) if(WITH_XBYAK)
list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
list(APPEND JIT_KERNEL_DEPS xbyak) list(APPEND JIT_KERNEL_DEPS xbyak)
endif() endif()
cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
endif (NOT WIN32)
...@@ -153,7 +153,7 @@ class Blas { ...@@ -153,7 +153,7 @@ class Blas {
void VEXP(int n, const T* x, T* y) const; void VEXP(int n, const T* x, T* y) const;
template <typename T> template <typename T>
void VSQR(int n, const T* x, T* y) const; void VSQUARE(int n, const T* x, T* y) const;
template <typename T> template <typename T>
void VPOW(int n, const T* x, T alpha, T* y) const; void VPOW(int n, const T* x, T alpha, T* y) const;
...@@ -245,8 +245,8 @@ class BlasT : private Blas<DeviceContext> { ...@@ -245,8 +245,8 @@ class BlasT : private Blas<DeviceContext> {
} }
template <typename... ARGS> template <typename... ARGS>
void VSQR(ARGS... args) const { void VSQUARE(ARGS... args) const {
Base()->template VSQR<T>(args...); Base()->template VSQUARE<T>(args...);
} }
template <typename... ARGS> template <typename... ARGS>
......
...@@ -105,7 +105,7 @@ struct CBlas<float> { ...@@ -105,7 +105,7 @@ struct CBlas<float> {
} }
template <typename... ARGS> template <typename... ARGS>
static void VSQR(ARGS... args) { static void VSQUARE(ARGS... args) {
platform::dynload::vsSqr(args...); platform::dynload::vsSqr(args...);
} }
...@@ -195,7 +195,7 @@ struct CBlas<double> { ...@@ -195,7 +195,7 @@ struct CBlas<double> {
} }
template <typename... ARGS> template <typename... ARGS>
static void VSQR(ARGS... args) { static void VSQUARE(ARGS... args) {
platform::dynload::vdSqr(args...); platform::dynload::vdSqr(args...);
} }
...@@ -262,7 +262,9 @@ struct CBlas<platform::float16> { ...@@ -262,7 +262,9 @@ struct CBlas<platform::float16> {
} }
static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
static void VSQR(...) { PADDLE_THROW("float16 VSQR not supported on CPU"); } static void VSQUARE(...) {
PADDLE_THROW("float16 VSQUARE not supported on CPU");
}
static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
...@@ -423,12 +425,12 @@ void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const { ...@@ -423,12 +425,12 @@ void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const {
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CPUDeviceContext>::VSQR(int n, const T *x, T *y) const { void Blas<platform::CPUDeviceContext>::VSQUARE(int n, const T *x, T *y) const {
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
CBlas<T>::VSQR(n, x, y); CBlas<T>::VSQUARE(n, x, y);
#else #else
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = std::sqrt(x[i]); y[i] = x[i] * x[i];
} }
#endif #endif
} }
......
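Note the semantic fix above: the non-MKLML fallback now squares each element (matching MKL's vsSqr/vdSqr) instead of taking a square root, hence the rename to VSQUARE. The reference behaviour, as a standalone sketch:

#include <cstdio>

// Reference semantics of VSQUARE: y[i] = x[i] * x[i].
void VSquareRef(int n, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] = x[i] * x[i];
}

int main() {
  const float x[4] = {1.f, -2.f, 3.f, 0.5f};
  float y[4];
  VSquareRef(4, x, y);
  for (float v : y) std::printf("%g ", v);  // 1 4 9 0.25
  std::printf("\n");
  return 0;
}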
...@@ -118,6 +118,39 @@ void VXXJitCode::generate() { ...@@ -118,6 +118,39 @@ void VXXJitCode::generate() {
ret(); ret();
} }
bool ReluJitCode::init(int d) { return MayIUse(avx); }
void ReluJitCode::generate() {
int offset = 0;
vxorps(ymm_zero, ymm_zero, ymm_zero);
for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) {
vmovups(ymm_src, ptr[param1 + offset]);
vmaxps(ymm_dst, ymm_zero, ymm_src);
vmovups(ptr[param2 + offset], ymm_dst);
offset += sizeof(float) * AVX_FLOAT_BLOCK;
}
int rest = num_ % AVX_FLOAT_BLOCK;
if (rest >= 4) {
vmovups(xmm_src, ptr[param1 + offset]);
vmaxps(xmm_dst, xmm_zero, xmm_src);
vmovups(ptr[param2 + offset], xmm_dst);
offset += sizeof(float) * 4;
rest -= 4;
}
if (rest >= 2) {
vmovups(xmm_src, ptr[param1 + offset]);
vmaxps(xmm_dst, xmm_zero, xmm_src);
vmovq(ptr[param2 + offset], xmm_dst);
offset += sizeof(float) * 2;
rest -= 2;
}
if (rest > 0) {
vmovups(xmm_src, ptr[param1 + offset]);
vmaxps(xmm_dst, xmm_zero, xmm_src);
vmovss(ptr[param2 + offset], xmm_dst);
}
ret();
}
} // namespace gen } // namespace gen
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
......
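The generated code above processes full 8-float AVX blocks, then finishes the remainder with 4-, 2- and 1-element stores. An equivalent scalar sketch of that blocking (illustration only, not the JIT output):

#include <algorithm>
#include <cstdio>

void ReluBlocked(const float* x, float* y, int n) {
  int i = 0;
  for (; i + 8 <= n; i += 8)  // full ymm-width blocks
    for (int k = 0; k < 8; ++k) y[i + k] = std::max(x[i + k], 0.f);
  const int tails[] = {4, 2, 1};  // mirror the xmm / scalar tail of the JIT
  for (int block : tails)
    if (n - i >= block) {
      for (int k = 0; k < block; ++k) y[i + k] = std::max(x[i + k], 0.f);
      i += block;
    }
}

int main() {
  float x[11], y[11];
  for (int i = 0; i < 11; ++i) x[i] = (i % 2 ? -1.f : 1.f) * i;
  ReluBlocked(x, y, 11);
  for (float v : y) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}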
...@@ -85,6 +85,29 @@ class VXXJitCode : public JitCode { ...@@ -85,6 +85,29 @@ class VXXJitCode : public JitCode {
ymm_t ymm_zero = ymm_t(3); ymm_t ymm_zero = ymm_t(3);
}; };
class ReluJitCode : public JitCode {
public:
DECLARE_JIT_CODE(ReluJitCode);
explicit ReluJitCode(int d, size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), num_(d) {}
static bool init(int d);
void generate() override;
private:
int num_;
reg64_t param1{abi_param1};
reg64_t param2{abi_param2};
xmm_t xmm_zero = xmm_t(0);
xmm_t xmm_src = xmm_t(1);
xmm_t xmm_dst = xmm_t(1);
ymm_t ymm_zero = ymm_t(0);
ymm_t ymm_src = ymm_t(1);
ymm_t ymm_dst = ymm_t(1);
};
} // namespace gen } // namespace gen
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
......
...@@ -97,37 +97,38 @@ class VAddBiasKernel : public Kernel { ...@@ -97,37 +97,38 @@ class VAddBiasKernel : public Kernel {
template <typename T> template <typename T>
class VActKernel : public Kernel { class VActKernel : public Kernel {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
}; };
template <typename T> template <typename T>
class VReluKernel : public VActKernel<T> { class VReluKernel : public VActKernel<T> {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
void (*Compute)(const T *, T *, int);
}; };
template <typename T> template <typename T>
class VIdentityKernel : public VActKernel<T> { class VIdentityKernel : public VActKernel<T> {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
}; };
template <typename T> template <typename T>
class VExpKernel : public VActKernel<T> { class VExpKernel : public VActKernel<T> {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
}; };
template <typename T> template <typename T>
class VSigmoidKernel : public VActKernel<T> { class VSigmoidKernel : public VActKernel<T> {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
}; };
template <typename T> template <typename T>
class VTanhKernel : public VActKernel<T> { class VTanhKernel : public VActKernel<T> {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
}; };
template <typename T> template <typename T>
......
...@@ -71,6 +71,13 @@ void VAddBiasRefer(const T* a, const T* x, T* y, int n) { ...@@ -71,6 +71,13 @@ void VAddBiasRefer(const T* a, const T* x, T* y, int n) {
} }
} }
template <typename T>
void VReluRefer(const T* x, T* y, int n) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
template <typename T> template <typename T>
void VMulMKL(const T* x, const T* y, T* z, int n); void VMulMKL(const T* x, const T* y, T* z, int n);
...@@ -344,124 +351,60 @@ bool VAddBiasKernelImpl<float>::useJIT(int d) { ...@@ -344,124 +351,60 @@ bool VAddBiasKernelImpl<float>::useJIT(int d) {
} }
#endif #endif
#undef DECLARE_STATIC_FUNC
REGISTER_JITKERNEL(vmul, VMulKernel);
REGISTER_JITKERNEL(vadd, VAddKernel);
REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
REGISTER_JITKERNEL(vscal, VScalKernel);
REGISTER_JITKERNEL(vaddbias, VAddBiasKernel);
/* VRelu JitKernel */ /* VRelu JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block> template <typename T>
class VReluKernelImpl : public VReluKernel<T> { class VReluKernelImpl : public VReluKernel<T> {
public: public:
explicit VReluKernelImpl(int d) : VReluKernel<T>() { this->num_ = d; } DECLARE_STATIC_FUNC;
void Compute(const T* x, T* y) const override { explicit VReluKernelImpl(int d) : VReluKernel<T>() {
for (int i = 0; i < this->num_; ++i) { this->num_ = d; // TODO(TJ): remove me when ComputeDeprecated done
y[i] = x[i] > 0 ? x[i] : 0; #ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) {
size_t sz = 96 /*init*/ +
d / AVX_FLOAT_BLOCK * 4 /* instructions*/ *
8 /*average bytes per instruction*/;
jitcode_.reset(new gen::ReluJitCode(d, sz > 4096 ? sz : 4096));
this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
return;
} }
} #endif
};
#define INTRI8_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \ this->Compute = VReluRefer<T>;
template <> \
VReluKernelImpl<float, isa, kGT8LT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + this->rest_, tmp1); \
} }
void ComputeDeprecated(const T* x, T* y) const override {
#define INTRI_GT16_FLOAT(isa) \ VReluRefer(x, y, this->num_);
template <> \
VReluKernelImpl<float, isa, kGT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = d - d % AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + i, tmp); \
} \
__m256 tmp = _mm256_loadu_ps(x + this->rest_); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + this->rest_, tmp); \
} }
#ifdef PADDLE_WITH_XBYAK
#ifdef __AVX__ private:
INTRI8_FLOAT(jit::avx); std::unique_ptr<gen::ReluJitCode> jitcode_{nullptr};
INTRI16_FLOAT(jit::avx);
INTRI_GT8LT16_FLOAT(jit::avx);
INTRI_GT16_FLOAT(jit::avx);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
INTRI16_FLOAT(jit::avx2);
INTRI_GT8LT16_FLOAT(jit::avx2);
INTRI_GT16_FLOAT(jit::avx2);
#endif #endif
#ifdef __AVX512F__ };
// TODO(TJ): refine avx512
INTRI8_FLOAT(jit::avx512f); #ifdef PADDLE_WITH_XBYAK
INTRI16_FLOAT(jit::avx512f); template <>
INTRI_GT8LT16_FLOAT(jit::avx512f); bool VReluKernelImpl<float>::useJIT(int d) {
INTRI_GT16_FLOAT(jit::avx512f); return gen::ReluJitCode::init(d);
}
#endif #endif
#undef INTRI8_FLOAT #undef DECLARE_STATIC_FUNC
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT REGISTER_JITKERNEL(vmul, VMulKernel);
#undef INTRI_GT16_FLOAT REGISTER_JITKERNEL(vadd, VAddKernel);
REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
REGISTER_JITKERNEL(vscal, VScalKernel);
REGISTER_JITKERNEL(vaddbias, VAddBiasKernel);
REGISTER_JITKERNEL(vrelu, VReluKernel);
/* An empty JitKernel */ /* An empty JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block> template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VIdentityKernelImpl : public VIdentityKernel<T> { class VIdentityKernelImpl : public VIdentityKernel<T> {
public: public:
explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() { this->num_ = d; } explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() { this->num_ = d; }
void Compute(const T* x, T* y) const override {} void ComputeDeprecated(const T* x, T* y) const override {}
}; };
REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel);
REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel); REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel);
} // namespace jitkernel } // namespace jitkernel
......
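With this refactor, VReluKernel::Compute is a plain function pointer set at construction time (JIT-generated code when xbyak can emit it, VReluRefer otherwise), so call sites pass the length explicitly. A hedged usage sketch based on the pattern in the updated test below:

#include "paddle/fluid/operators/math/jit_kernel.h"

void ApplyRelu(const float* x, float* y, int n) {
  namespace jit = paddle::operators::math::jitkernel;
  const auto& ker =
      jit::KernelPool::Instance().Get<jit::VReluKernel<float>>(n);
  ker->Compute(x, y, n);  // jitted code if available, VReluRefer fallback
}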
...@@ -35,7 +35,7 @@ template <typename T, jit::cpu_isa_t isa, jit_block> ...@@ -35,7 +35,7 @@ template <typename T, jit::cpu_isa_t isa, jit_block>
class VExpKernelImpl : public VExpKernel<T> { class VExpKernelImpl : public VExpKernel<T> {
public: public:
explicit VExpKernelImpl(int d) : VExpKernel<T>() { this->num_ = d; } explicit VExpKernelImpl(int d) : VExpKernel<T>() { this->num_ = d; }
void Compute(const T* x, T* y) const override { void ComputeDeprecated(const T* x, T* y) const override {
for (int i = 0; i < this->num_; ++i) { for (int i = 0; i < this->num_; ++i) {
y[i] = std::exp(x[i]); y[i] = std::exp(x[i]);
} }
...@@ -43,18 +43,18 @@ class VExpKernelImpl : public VExpKernel<T> { ...@@ -43,18 +43,18 @@ class VExpKernelImpl : public VExpKernel<T> {
}; };
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \ #define MKL_FLOAT(isa, block) \
template <> \ template <> \
void VExpKernelImpl<float, isa, block>::Compute(const float* x, float* y) \ void VExpKernelImpl<float, isa, block>::ComputeDeprecated(const float* x, \
const { \ float* y) const { \
platform::dynload::vsExp(this->num_, x, y); \ platform::dynload::vsExp(this->num_, x, y); \
} }
#define MKL_DOUBLE(isa, block) \ #define MKL_DOUBLE(isa, block) \
template <> \ template <> \
void VExpKernelImpl<double, isa, block>::Compute(const double* x, double* y) \ void VExpKernelImpl<double, isa, block>::ComputeDeprecated( \
const { \ const double* x, double* y) const { \
platform::dynload::vdExp(this->num_, x, y); \ platform::dynload::vdExp(this->num_, x, y); \
} }
FOR_EACH_ISA(MKL_FLOAT, kLT8); FOR_EACH_ISA(MKL_FLOAT, kLT8);
FOR_EACH_ISA(MKL_FLOAT, kGT8LT16); FOR_EACH_ISA(MKL_FLOAT, kGT8LT16);
...@@ -211,24 +211,24 @@ __m256 ExpAVX2(__m256 x) { ...@@ -211,24 +211,24 @@ __m256 ExpAVX2(__m256 x) {
} // namespace detail } // namespace detail
#define INTRI8_FLOAT(isa, expisa) \ #define INTRI8_FLOAT(isa, expisa) \
template <> \ template <> \
void VExpKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \ void VExpKernelImpl<float, isa, kEQ8>::ComputeDeprecated(const float* x, \
const { \ float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \ __m256 tmp = _mm256_loadu_ps(x); \
_mm256_storeu_ps(y, expisa(tmp)); \ _mm256_storeu_ps(y, expisa(tmp)); \
} }
#define INTRI16_FLOAT(isa, expisa) \ #define INTRI16_FLOAT(isa, expisa) \
template <> \ template <> \
void VExpKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \ void VExpKernelImpl<float, isa, kEQ16>::ComputeDeprecated(const float* x, \
const { \ float* y) const { \
__m256 tmp0 = _mm256_loadu_ps(x); \ __m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \ __m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = expisa(tmp0); \ tmp0 = expisa(tmp0); \
tmp1 = expisa(tmp1); \ tmp1 = expisa(tmp1); \
_mm256_storeu_ps(y, tmp0); \ _mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \ _mm256_storeu_ps(y + 8, tmp1); \
} }
#ifdef __AVX__ #ifdef __AVX__
...@@ -260,14 +260,14 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -260,14 +260,14 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
this->num_ = d; this->num_ = d;
vexp_ = KernelPool::Instance().template Get<VExpKernel<T>>(d); vexp_ = KernelPool::Instance().template Get<VExpKernel<T>>(d);
} }
void Compute(const T* x, T* y) const override { void ComputeDeprecated(const T* x, T* y) const override {
const T min = SIGMOID_THRESHOLD_MIN; const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX; const T max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < this->num_; ++i) { for (int i = 0; i < this->num_; ++i) {
y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = static_cast<T>(0) - y[i]; y[i] = static_cast<T>(0) - y[i];
} }
vexp_->Compute(y, y); vexp_->ComputeDeprecated(y, y);
for (int i = 0; i < this->num_; ++i) { for (int i = 0; i < this->num_; ++i) {
y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]); y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
} }
...@@ -285,30 +285,30 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -285,30 +285,30 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp) tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp)
#define INTRI8_FLOAT(isa, expisa) \ #define INTRI8_FLOAT(isa, expisa) \
template <> \ template <> \
void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \ void VSigmoidKernelImpl<float, isa, kEQ8>::ComputeDeprecated( \
const { \ const float* x, float* y) const { \
/* TODO(TJ): try to use static const*/ \ /* TODO(TJ): try to use static const*/ \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \ __m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max, expisa); \ INTRI_SIGMOID(tmp, min, max, expisa); \
_mm256_storeu_ps(y, tmp); \ _mm256_storeu_ps(y, tmp); \
} }
#define INTRI16_FLOAT(isa, expisa) \ #define INTRI16_FLOAT(isa, expisa) \
template <> \ template <> \
void VSigmoidKernelImpl<float, isa, kEQ16>::Compute(const float* x, \ void VSigmoidKernelImpl<float, isa, kEQ16>::ComputeDeprecated( \
float* y) const { \ const float* x, float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp0 = _mm256_loadu_ps(x); \ __m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \ __m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_SIGMOID(tmp0, min, max, expisa); \ INTRI_SIGMOID(tmp0, min, max, expisa); \
INTRI_SIGMOID(tmp1, min, max, expisa); \ INTRI_SIGMOID(tmp1, min, max, expisa); \
_mm256_storeu_ps(y, tmp0); \ _mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \ _mm256_storeu_ps(y + 8, tmp1); \
} }
#define INTRI_GT8LT16_FLOAT(isa, expisa) \ #define INTRI_GT8LT16_FLOAT(isa, expisa) \
...@@ -322,8 +322,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -322,8 +322,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \ KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \ } \
template <> \ template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \ void VSigmoidKernelImpl<float, isa, kGT8LT16>::ComputeDeprecated( \
float* y) const { \ const float* x, float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \ __m256 tmp = _mm256_loadu_ps(x); \
...@@ -335,7 +335,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -335,7 +335,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \ y[i] = 0.f - y[i]; \
} \ } \
vexp_->Compute(y + this->end_, y + this->end_); \ vexp_->ComputeDeprecated(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \ for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \ y[i] = 1.f / (1.f + y[i]); \
} \ } \
...@@ -352,8 +352,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -352,8 +352,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \ KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \ } \
template <> \ template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::Compute(const float* x, \ void VSigmoidKernelImpl<float, isa, kGT16>::ComputeDeprecated( \
float* y) const { \ const float* x, float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
...@@ -367,7 +367,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -367,7 +367,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \ y[i] = 0.f - y[i]; \
} \ } \
vexp_->Compute(y + this->end_, y + this->end_); \ vexp_->ComputeDeprecated(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \ for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \ y[i] = 1.f / (1.f + y[i]); \
} \ } \
...@@ -408,10 +408,10 @@ class VTanhKernelImpl : public VTanhKernel<T> { ...@@ -408,10 +408,10 @@ class VTanhKernelImpl : public VTanhKernel<T> {
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<T>>(d); vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<T>>(d);
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<T>>(d); vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<T>>(d);
} }
void Compute(const T* x, T* y) const override { void ComputeDeprecated(const T* x, T* y) const override {
const T a = static_cast<T>(2), b = static_cast<T>(-1); const T a = static_cast<T>(2), b = static_cast<T>(-1);
vscal_->Compute(&a, x, y, this->num_); vscal_->Compute(&a, x, y, this->num_);
vsigmoid_->Compute(y, y); vsigmoid_->ComputeDeprecated(y, y);
vscal_->Compute(&a, y, y, this->num_); vscal_->Compute(&a, y, y, this->num_);
vaddbias_->Compute(&b, y, y, this->num_); vaddbias_->Compute(&b, y, y, this->num_);
} }
...@@ -430,25 +430,25 @@ class VTanhKernelImpl : public VTanhKernel<T> { ...@@ -430,25 +430,25 @@ class VTanhKernelImpl : public VTanhKernel<T> {
tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \ tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \
tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f)) tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f))
#define INTRI8_FLOAT(isa, expisa) \ #define INTRI8_FLOAT(isa, expisa) \
template <> \ template <> \
void VTanhKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \ void VTanhKernelImpl<float, isa, kEQ8>::ComputeDeprecated(const float* x, \
const { \ float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \ __m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp, expisa); \ INTRI_VTANH(tmp, expisa); \
_mm256_storeu_ps(y, tmp); \ _mm256_storeu_ps(y, tmp); \
} }
#define INTRI16_FLOAT(isa, expisa) \ #define INTRI16_FLOAT(isa, expisa) \
template <> \ template <> \
void VTanhKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \ void VTanhKernelImpl<float, isa, kEQ16>::ComputeDeprecated(const float* x, \
const { \ float* y) const { \
__m256 tmp0 = _mm256_loadu_ps(x); \ __m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \ __m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_VTANH(tmp0, expisa); \ INTRI_VTANH(tmp0, expisa); \
INTRI_VTANH(tmp1, expisa); \ INTRI_VTANH(tmp1, expisa); \
_mm256_storeu_ps(y, tmp0); \ _mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \ _mm256_storeu_ps(y + 8, tmp1); \
} }
#define INTRI_GT8LT16_FLOAT(isa, expisa) \ #define INTRI_GT8LT16_FLOAT(isa, expisa) \
...@@ -466,8 +466,8 @@ class VTanhKernelImpl : public VTanhKernel<T> { ...@@ -466,8 +466,8 @@ class VTanhKernelImpl : public VTanhKernel<T> {
this->rest_); \ this->rest_); \
} \ } \
template <> \ template <> \
void VTanhKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \ void VTanhKernelImpl<float, isa, kGT8LT16>::ComputeDeprecated( \
float* y) const { \ const float* x, float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \ __m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp, expisa); \ INTRI_VTANH(tmp, expisa); \
_mm256_storeu_ps(y, tmp); \ _mm256_storeu_ps(y, tmp); \
...@@ -475,40 +475,40 @@ class VTanhKernelImpl : public VTanhKernel<T> { ...@@ -475,40 +475,40 @@ class VTanhKernelImpl : public VTanhKernel<T> {
y += AVX_FLOAT_BLOCK; \ y += AVX_FLOAT_BLOCK; \
const float a = 2.f, b = -1.f; \ const float a = 2.f, b = -1.f; \
vscal_->Compute(&a, x, y, this->num_); \ vscal_->Compute(&a, x, y, this->num_); \
vsigmoid_->Compute(y, y); \ vsigmoid_->ComputeDeprecated(y, y); \
vscal_->Compute(&a, y, y, this->num_); \ vscal_->Compute(&a, y, y, this->num_); \
vaddbias_->Compute(&b, y, y, this->num_); \ vaddbias_->Compute(&b, y, y, this->num_); \
} }
#define INTRI_GT16_FLOAT(isa, expisa) \ #define INTRI_GT16_FLOAT(isa, expisa) \
template <> \ template <> \
VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d) \ VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \ : VTanhKernel<float>() { \
this->num_ = d; \ this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \ this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \ this->end_ = d - this->rest_; \
vscal_ = \ vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \ KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \ vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \ this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \ vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \ this->rest_); \
} \ } \
template <> \ template <> \
void VTanhKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \ void VTanhKernelImpl<float, isa, kGT16>::ComputeDeprecated(const float* x, \
const { \ float* y) const { \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \ __m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_VTANH(tmp, expisa); \ INTRI_VTANH(tmp, expisa); \
_mm256_storeu_ps(y + i, tmp); \ _mm256_storeu_ps(y + i, tmp); \
} \ } \
x += this->end_; \ x += this->end_; \
y += this->end_; \ y += this->end_; \
const float a = 2.f, b = -1.f; \ const float a = 2.f, b = -1.f; \
vscal_->Compute(&a, x, y, this->num_); \ vscal_->Compute(&a, x, y, this->num_); \
vsigmoid_->Compute(y, y); \ vsigmoid_->ComputeDeprecated(y, y); \
vscal_->Compute(&a, y, y, this->num_); \ vscal_->Compute(&a, y, y, this->num_); \
vaddbias_->Compute(&b, y, y, this->num_); \ vaddbias_->Compute(&b, y, y, this->num_); \
} }
#ifdef __AVX__ #ifdef __AVX__
......
...@@ -175,26 +175,26 @@ class LSTMKernelImpl : public LSTMKernel<T> { ...@@ -175,26 +175,26 @@ class LSTMKernelImpl : public LSTMKernel<T> {
void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
T* checked) const override { T* checked) const override {
// gates: W_ch, W_ih, W_fh, W_oh // gates: W_ch, W_ih, W_fh, W_oh
act_gate_d3_->Compute(gates + d_, gates + d_); act_gate_d3_->ComputeDeprecated(gates + d_, gates + d_);
/* C_t = C_t-1 * fgated + cand_gated * igated */ /* C_t = C_t-1 * fgated + cand_gated * igated */
act_cand_d_->Compute(gates, gates); act_cand_d_->ComputeDeprecated(gates, gates);
vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
act_cell_d_->Compute(ct, gates + d2_); act_cell_d_->ComputeDeprecated(ct, gates + d2_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
} }
void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
/* C_t = igated * cgated*/ /* C_t = igated * cgated*/
act_gate_d_->Compute(gates + d_, gates + d_); act_gate_d_->ComputeDeprecated(gates + d_, gates + d_);
act_cand_d_->Compute(gates, gates); act_cand_d_->ComputeDeprecated(gates, gates);
vmul_d_->Compute(gates, gates + d_, ct, d_); vmul_d_->Compute(gates, gates + d_, ct, d_);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
act_gate_d_->Compute(gates + d3_, gates + d3_); act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_);
act_cell_d_->Compute(ct, gates + d2_); act_cell_d_->ComputeDeprecated(ct, gates + d2_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
} }
...@@ -292,32 +292,32 @@ class PeepholeKernelImpl : public LSTMKernel<T> { ...@@ -292,32 +292,32 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
vmul_d_->Compute(wp_data, ct_1, checked, d_); vmul_d_->Compute(wp_data, ct_1, checked, d_);
vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_);
vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_); vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_);
act_gate_d2_->Compute(gates + d_, gates + d_); act_gate_d2_->ComputeDeprecated(gates + d_, gates + d_);
/* C_t = C_t-1 * fgated + cand_gated * igated*/ /* C_t = C_t-1 * fgated + cand_gated * igated*/
act_cand_d_->Compute(gates, gates); act_cand_d_->ComputeDeprecated(gates, gates);
vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
/* get ogated*/ /* get ogated*/
vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
act_gate_d_->Compute(gates + d3_, gates + d3_); act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
act_cell_d_->Compute(ct, gates + d2_); act_cell_d_->ComputeDeprecated(ct, gates + d2_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
} }
void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
/* C_t = igated * cgated*/ /* C_t = igated * cgated*/
act_gate_d_->Compute(gates + d_, gates + d_); act_gate_d_->ComputeDeprecated(gates + d_, gates + d_);
act_cand_d_->Compute(gates, gates); act_cand_d_->ComputeDeprecated(gates, gates);
vmul_d_->Compute(gates, gates + d_, ct, d_); vmul_d_->Compute(gates, gates + d_, ct, d_);
/* get outgated, put W_oc * C_t on igated */ /* get outgated, put W_oc * C_t on igated */
vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
act_gate_d_->Compute(gates + d3_, gates + d3_); act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_);
act_cell_d_->Compute(ct, gates + d2_); act_cell_d_->ComputeDeprecated(ct, gates + d2_);
vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_);
} }
...@@ -376,20 +376,20 @@ class GRUKernelImpl : public GRUKernel<T> { ...@@ -376,20 +376,20 @@ class GRUKernelImpl : public GRUKernel<T> {
} }
void ComputeH1(T* gates, T* ht) const override { void ComputeH1(T* gates, T* ht) const override {
act_gate_d_->Compute(gates, gates); act_gate_d_->ComputeDeprecated(gates, gates);
act_state_d_->Compute(gates + d2_, gates + d2_); act_state_d_->ComputeDeprecated(gates + d2_, gates + d2_);
vmul_d_->Compute(gates, gates + d2_, ht, d_); vmul_d_->Compute(gates, gates + d2_, ht, d_);
} }
void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override {
// W: {W_update, W_reset; W_state} // W: {W_update, W_reset; W_state}
act_gate_d2_->Compute(gates, gates); act_gate_d2_->ComputeDeprecated(gates, gates);
vmul_d_->Compute(ht_1, gates + d_, ht, d_); vmul_d_->Compute(ht_1, gates + d_, ht, d_);
} }
void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override { void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override {
T* y = gates + d2_; T* y = gates + d2_;
act_state_d_->Compute(y, y); act_state_d_->ComputeDeprecated(y, y);
// out = zt*ht~ + (1-zt)*ht_1 // out = zt*ht~ + (1-zt)*ht_1
for (int i = 0; i < d_; ++i) { for (int i = 0; i < d_; ++i) {
ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i]; ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
......
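For reference, the cell update these kernels implement, written out from the comments above (assuming the default sigmoid gate activations and tanh candidate/cell activations):

$$C_t = f_t \odot C_{t-1} + i_t \odot \tilde{C}_t$$
$$H_t = o_t \odot act\_cell(C_t)$$

where $i_t$, $f_t$, $o_t$ are the gate activations produced by act_gate and $\tilde{C}_t$ is the candidate produced by act_cand; the peephole variant additionally mixes $w_p \odot C$ terms into the gates before the activations are applied.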
...@@ -92,7 +92,7 @@ TEST(JitKernel, vrelu) { ...@@ -92,7 +92,7 @@ TEST(JitKernel, vrelu) {
#endif #endif
auto ttgts = GetCurrentUS(); auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, ztgt_data); ker->Compute(x_data, ztgt_data, d);
} }
auto ttgte = GetCurrentUS(); auto ttgte = GetCurrentUS();
VLOG(30) << "Vec size " << d VLOG(30) << "Vec size " << d
...@@ -181,7 +181,7 @@ TEST(JitKernel, vexp) { ...@@ -181,7 +181,7 @@ TEST(JitKernel, vexp) {
auto ttgts = GetCurrentUS(); auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, ztgt_data); ker->ComputeDeprecated(x_data, ztgt_data);
} }
auto ttgte = GetCurrentUS(); auto ttgte = GetCurrentUS();
...@@ -222,7 +222,7 @@ void vsigmoid_better( ...@@ -222,7 +222,7 @@ void vsigmoid_better(
y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = 0.f - y[i]; y[i] = 0.f - y[i];
} }
vexp->Compute(y, y); vexp->ComputeDeprecated(y, y);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = 1.f / (1.f + y[i]); y[i] = 1.f / (1.f + y[i]);
} }
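The reference path in vsigmoid_better first clips x to [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX], so what it evaluates is
$$
\sigma(x) = \frac{1}{1 + e^{-\mathrm{clip}(x)}}
$$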
...@@ -253,7 +253,7 @@ TEST(JitKernel, vsigmoid) { ...@@ -253,7 +253,7 @@ TEST(JitKernel, vsigmoid) {
auto trefe = GetCurrentUS(); auto trefe = GetCurrentUS();
auto ttgts = GetCurrentUS(); auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, ztgt_data); ker->ComputeDeprecated(x_data, ztgt_data);
} }
auto ttgte = GetCurrentUS(); auto ttgte = GetCurrentUS();
...@@ -287,7 +287,7 @@ void vtanh_better( ...@@ -287,7 +287,7 @@ void vtanh_better(
const int n, const float* x, float* y) { const int n, const float* x, float* y) {
const float a = 2.f, b = -1.f; const float a = 2.f, b = -1.f;
vscal->Compute(&a, x, y, n); vscal->Compute(&a, x, y, n);
vsigmoid->Compute(y, y); vsigmoid->ComputeDeprecated(y, y);
vscal->Compute(&a, y, y, n); vscal->Compute(&a, y, y, n);
vaddbias->Compute(&b, y, y, n); vaddbias->Compute(&b, y, y, n);
} }
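vtanh_better composes vscal, vsigmoid and vaddbias (a = 2, b = -1 in the code above), relying on the identity
$$
\tanh(x) = 2\,\sigma(2x) - 1
$$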
...@@ -321,7 +321,7 @@ TEST(JitKernel, vtanh) { ...@@ -321,7 +321,7 @@ TEST(JitKernel, vtanh) {
auto trefe = GetCurrentUS(); auto trefe = GetCurrentUS();
auto ttgts = GetCurrentUS(); auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, ztgt_data); ker->ComputeDeprecated(x_data, ztgt_data);
} }
auto ttgte = GetCurrentUS(); auto ttgte = GetCurrentUS();
...@@ -344,8 +344,8 @@ void lstm_ctht_ref( ...@@ -344,8 +344,8 @@ void lstm_ctht_ref(
const std::shared_ptr< const std::shared_ptr<
const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp_1, const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp_1,
const int d, float* gates, const float* ct_1, float* ct, float* ht) { const int d, float* gates, const float* ct_1, float* ct, float* ht) {
vsigmoid_3d->Compute(gates + d, gates + d); vsigmoid_3d->ComputeDeprecated(gates + d, gates + d);
vtanh_d->Compute(gates, gates); vtanh_d->ComputeDeprecated(gates, gates);
const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3;
const float min = SIGMOID_THRESHOLD_MIN; const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX; const float max = SIGMOID_THRESHOLD_MAX;
...@@ -355,7 +355,7 @@ void lstm_ctht_ref( ...@@ -355,7 +355,7 @@ void lstm_ctht_ref(
// H_t = act_cell(C_t) * ogated // H_t = act_cell(C_t) * ogated
float tmp = ct[k] * 2; float tmp = ct[k] * 2;
tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
vexp_1->Compute(&tmp, &tmp); vexp_1->ComputeDeprecated(&tmp, &tmp);
tmp = 2.f / (1.f + tmp) - 1.f; tmp = 2.f / (1.f + tmp) - 1.f;
ht[k] = tmp * o[k]; ht[k] = tmp * o[k];
} }
...@@ -373,13 +373,13 @@ void lstm_ctht_better( ...@@ -373,13 +373,13 @@ void lstm_ctht_better(
const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd_d, const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd_d,
const int d, float* gates, const float* ct_1, float* ct, float* ht) { const int d, float* gates, const float* ct_1, float* ct, float* ht) {
int d2 = d * 2; int d2 = d * 2;
vsigmoid_3d->Compute(gates + d, gates + d); vsigmoid_3d->ComputeDeprecated(gates + d, gates + d);
vtanh_d->Compute(gates, gates); vtanh_d->ComputeDeprecated(gates, gates);
vmul_d->Compute(gates, gates + d, gates + d, d); vmul_d->Compute(gates, gates + d, gates + d, d);
vmul_d->Compute(ct_1, gates + d2, gates + d2, d); vmul_d->Compute(ct_1, gates + d2, gates + d2, d);
vadd_d->Compute(gates + d, gates + d2, ct, d); vadd_d->Compute(gates + d, gates + d2, ct, d);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
vtanh_d->Compute(ct, gates + d2); vtanh_d->ComputeDeprecated(ct, gates + d2);
vmul_d->Compute(gates + d2, gates + d * 3, ht, d); vmul_d->Compute(gates + d2, gates + d * 3, ht, d);
} }
...@@ -736,7 +736,7 @@ void vaddrelu_better( ...@@ -736,7 +736,7 @@ void vaddrelu_better(
const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu, const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu,
const float* x, const float* y, float* z, int d) { const float* x, const float* y, float* z, int d) {
vadd->Compute(x, y, z, d); vadd->Compute(x, y, z, d);
vrelu->Compute(z, z); vrelu->ComputeDeprecated(z, z);
} }
TEST(JitKernel, vaddrelu) { TEST(JitKernel, vaddrelu) {
......
...@@ -244,7 +244,7 @@ typename std::enable_if< ...@@ -244,7 +244,7 @@ typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas, elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
size_t data_len, const T* in, T* out) { size_t data_len, const T* in, T* out) {
for (int64_t i = 0; i < data_len; i++) { for (size_t i = 0; i < data_len; i++) {
out[i] += in[i]; out[i] += in[i];
} }
} }
......
...@@ -70,11 +70,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { ...@@ -70,11 +70,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
EXPECT_EQ(in_grad.lod(), lod); EXPECT_EQ(in_grad.lod(), lod);
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
for (int64_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) {
int64_t begin = in_grad.lod()[0][i]; int64_t begin = in_grad.lod()[0][i];
int64_t end = in_grad.lod()[0][i + 1]; int64_t end = in_grad.lod()[0][i + 1];
paddle::framework::Tensor tmp = in_grad.Slice(begin, end); paddle::framework::Tensor tmp = in_grad.Slice(begin, end);
for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { for (size_t j = 0; j != tmp.numel() / second_dim; ++j) {
for (int64_t m = 0; m != second_dim; ++m) { for (int64_t m = 0; m != second_dim; ++m) {
EXPECT_EQ(tmp.data<T>()[m + j * second_dim], EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
out_grad.data<T>()[m + i * second_dim]); out_grad.data<T>()[m + i * second_dim]);
...@@ -82,11 +82,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { ...@@ -82,11 +82,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
} }
} }
} else { } else {
for (int64_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) { for (size_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) {
int64_t begin = cpu_in_grad.lod()[0][i]; int64_t begin = cpu_in_grad.lod()[0][i];
int64_t end = cpu_in_grad.lod()[0][i + 1]; int64_t end = cpu_in_grad.lod()[0][i + 1];
paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end); paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end);
for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { for (size_t j = 0; j != tmp.numel() / second_dim; ++j) {
for (int64_t m = 0; m != second_dim; ++m) { for (int64_t m = 0; m != second_dim; ++m) {
EXPECT_EQ(tmp.data<T>()[m + j * second_dim], EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
cpu_out_grad.data<T>()[m + i * second_dim]); cpu_out_grad.data<T>()[m + i * second_dim]);
......
...@@ -19,8 +19,10 @@ namespace paddle { ...@@ -19,8 +19,10 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template class SoftmaxFunctor<platform::CPUDeviceContext, float>; template class SoftmaxFunctor<platform::CPUDeviceContext, float, true>;
template class SoftmaxFunctor<platform::CPUDeviceContext, double>; template class SoftmaxFunctor<platform::CPUDeviceContext, float, false>;
template class SoftmaxFunctor<platform::CPUDeviceContext, double, true>;
template class SoftmaxFunctor<platform::CPUDeviceContext, double, false>;
template class SoftmaxGradFunctor<platform::CPUDeviceContext, float>; template class SoftmaxGradFunctor<platform::CPUDeviceContext, float>;
template class SoftmaxGradFunctor<platform::CPUDeviceContext, double>; template class SoftmaxGradFunctor<platform::CPUDeviceContext, double>;
......
...@@ -98,9 +98,14 @@ template class SoftmaxGradCUDNNFunctor<float>; ...@@ -98,9 +98,14 @@ template class SoftmaxGradCUDNNFunctor<float>;
template class SoftmaxGradCUDNNFunctor<double>; template class SoftmaxGradCUDNNFunctor<double>;
template class SoftmaxGradCUDNNFunctor<platform::float16>; template class SoftmaxGradCUDNNFunctor<platform::float16>;
template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>; template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
template class SoftmaxFunctor<platform::CUDADeviceContext, float>; false>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double>; template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
true>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>; template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>; template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, template class SoftmaxGradFunctor<platform::CUDADeviceContext,
......
...@@ -19,7 +19,7 @@ namespace paddle { ...@@ -19,7 +19,7 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T, bool is_test>
class SoftmaxFunctor { class SoftmaxFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor* X, void operator()(const DeviceContext& context, const framework::Tensor* X,
......
...@@ -32,10 +32,10 @@ struct ValueClip { ...@@ -32,10 +32,10 @@ struct ValueClip {
} }
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T, bool is_test>
void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context, void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
const framework::Tensor* X, const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) { framework::Tensor* Y) {
auto logits = EigenMatrix<T>::From(*X); auto logits = EigenMatrix<T>::From(*X);
auto softmax = EigenMatrix<T>::From(*Y); auto softmax = EigenMatrix<T>::From(*Y);
...@@ -65,6 +65,39 @@ void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context, ...@@ -65,6 +65,39 @@ void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
.broadcast(one_by_class)); .broadcast(one_by_class));
} }
template <typename DeviceContext, typename T>
class SoftmaxFunctor<DeviceContext, T, true> {
 public:
void operator()(const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) {
auto logits = EigenMatrix<T>::From(*X);
auto softmax = EigenMatrix<T>::From(*Y);
const int kBatchDim = 0;
const int kClassDim = 1;
const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim);
Eigen::DSizes<int, 1> along_class(kClassDim);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
Eigen::DSizes<int, 2> one_by_class(1, num_classes);
auto shifted_logits = (logits -
logits.maximum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
softmax.device(*context.eigen_device()) = shifted_logits.exp();
softmax.device(*context.eigen_device()) = (softmax *
softmax.sum(along_class)
.inverse()
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
}
};
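The is_test = true specialization above skips the ValueClip step of the generic functor but otherwise computes the usual shift-by-max softmax, row by row over the batch:
$$
\mathrm{softmax}(x)_{ij} = \frac{e^{x_{ij} - \max_k x_{ik}}}{\sum_{k} e^{x_{ik} - \max_k x_{ik}}}
$$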
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void SoftmaxGradFunctor<DeviceContext, T>::operator()( void SoftmaxGradFunctor<DeviceContext, T>::operator()(
const DeviceContext& context, const framework::Tensor* y, const DeviceContext& context, const framework::Tensor* y,
......
...@@ -43,11 +43,11 @@ class MergeIdsOpKernel : public framework::OpKernel<T> { ...@@ -43,11 +43,11 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(ids.size(), outs.size(), PADDLE_ENFORCE_EQ(ids.size(), outs.size(),
"the number of Ids and Out should be the same"); "the number of Ids and Out should be the same");
int row_ids_size = 0; size_t row_ids_size = 0;
int row_size = 0; int row_size = 0;
int embedding_size = 0; int embedding_size = 0;
for (int i = 0; i < x_tensors.size(); ++i) { for (size_t i = 0; i < x_tensors.size(); ++i) {
const auto *x_tensor = x_tensors[i]; const auto *x_tensor = x_tensors[i];
const auto *row_id = row_ids[i]; const auto *row_id = row_ids[i];
...@@ -66,7 +66,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> { ...@@ -66,7 +66,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
std::unordered_map<int64_t, std::tuple<int64_t, int64_t>> std::unordered_map<int64_t, std::tuple<int64_t, int64_t>>
selected_rows_idx_map; selected_rows_idx_map;
for (int i = 0; i < x_tensors.size(); ++i) { for (size_t i = 0; i < x_tensors.size(); ++i) {
const auto *row_id = row_ids[i]; const auto *row_id = row_ids[i];
for (int j = 0; j < row_id->numel(); ++j) { for (int j = 0; j < row_id->numel(); ++j) {
...@@ -78,7 +78,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> { ...@@ -78,7 +78,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(row_ids_size, selected_rows_idx_map.size(), PADDLE_ENFORCE_EQ(row_ids_size, selected_rows_idx_map.size(),
"the rows and tensor map size should be the same"); "the rows and tensor map size should be the same");
for (int i = 0; i < outs.size(); ++i) { for (size_t i = 0; i < outs.size(); ++i) {
auto *out_ids = ids[i]; auto *out_ids = ids[i];
auto *out = outs[i]; auto *out = outs[i];
......
...@@ -74,7 +74,7 @@ PadConstantLikeOp Operator. ...@@ -74,7 +74,7 @@ PadConstantLikeOp Operator.
Pad input(Y) with a pad_value, the number of values padded to the edges of each Pad input(Y) with a pad_value, the number of values padded to the edges of each
axis is specified by the difference of the shape of X and Y. axis is specified by the difference of the shape of X and Y.
((0, shape_x_0 - shape_y_0), (0, shape_x_n - shape_y_n)) unique pad widths for ((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) unique pad widths for
each axis. each axis.
The input should be a k-D tensor(k > 0 and k < 7). As an example: The input should be a k-D tensor(k > 0 and k < 7). As an example:
......
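As a small illustration of the rule above (shapes are hypothetical): if X has shape (2, 3, 2, 3) and Y has shape (1, 3, 1, 3), the pad widths are ((0, 1), (0, 0), (0, 1), (0, 0)), i.e. Y is padded with pad_value only at the high end of axes 0 and 2.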
...@@ -87,6 +87,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -87,6 +87,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize"); std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides"); std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
bool is_test = ctx.Attr<bool>("is_test");
if (ctx.Attr<bool>("global_pooling")) { if (ctx.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) { for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0; paddings[i] = 0;
...@@ -142,16 +143,10 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -142,16 +143,10 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd = std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top,
padding_right_bottom, ksize, pooling_type, padding_right_bottom, ksize, pooling_type,
mkldnn_engine, ceil_mode); mkldnn_engine, ceil_mode, is_test);
// save pool_pd into global device context to be referred in backward path // save pool_pd into global device context to be referred in backward path
dev_ctx.SetBlob(key_pool_pd, pool_pd); if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd);
std::shared_ptr<mkldnn::memory> workspace_memory =
CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
// save pool_workspace_memory to be referred in backward path
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
auto src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(), auto src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
to_void_cast<T>(input_data)); to_void_cast<T>(input_data));
...@@ -161,9 +156,19 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -161,9 +156,19 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); dev_ctx.SetBlob(key_pool_src_mem_p, src_memory);
dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory);
pool_p = std::make_shared<pooling_forward>(*pool_pd, *(src_memory.get()), if (is_test) {
*(dst_memory.get()), pool_p = std::make_shared<pooling_forward>(*pool_pd, *src_memory,
*workspace_memory); *dst_memory);
} else {
std::shared_ptr<mkldnn::memory> workspace_memory =
CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
// save pool_workspace_memory to be referred in backward path
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
pool_p = std::make_shared<pooling_forward>(
*pool_pd, *src_memory, *dst_memory, *workspace_memory);
}
dev_ctx.SetBlob(key_pool_p, pool_p); dev_ctx.SetBlob(key_pool_p, pool_p);
...@@ -201,9 +206,12 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -201,9 +206,12 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const std::vector<int>& stride, const std::vector<int>& padding_left_top, const std::vector<int>& stride, const std::vector<int>& padding_left_top,
const std::vector<int>& padding_right_bot, const std::vector<int>& kernel, const std::vector<int>& padding_right_bot, const std::vector<int>& kernel,
const std::string& pooling_type, const mkldnn::engine& engine, const std::string& pooling_type, const mkldnn::engine& engine,
bool ceil_mode) const { bool ceil_mode, bool is_test) const {
auto mkldnn_forward_prop_kind = is_test
? mkldnn::prop_kind::forward_inference
: mkldnn::prop_kind::forward_training;
auto pool_desc = mkldnn::pooling_forward::desc( auto pool_desc = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward, mkldnn_forward_prop_kind,
pooling_type == "max" ? mkldnn::algorithm::pooling_max pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg, : mkldnn::algorithm::pooling_avg,
src, dst, stride, kernel, padding_left_top, padding_right_bot, src, dst, stride, kernel, padding_left_top, padding_right_bot,
...@@ -248,6 +256,10 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -248,6 +256,10 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
out_grad->format() != memory::format::format_undef, out_grad->format() != memory::format::format_undef,
"Wrong layout/format set for Input output_grad tensor"); "Wrong layout/format set for Input output_grad tensor");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
std::string pooling_type = ctx.Attr<std::string>("pooling_type"); std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize"); std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides"); std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
......
...@@ -206,6 +206,11 @@ void Pool2dOpMaker::Make() { ...@@ -206,6 +206,11 @@ void Pool2dOpMaker::Make() {
"Defaults to \"NHWC\". Specify the data format of the output data, " "Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ") "the input will be transformed automatically. ")
.SetDefault("AnyLayout"); .SetDefault("AnyLayout");
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
// TODO(dzhwinter): need to registered layout transform function // TODO(dzhwinter): need to registered layout transform function
AddComment(R"DOC( AddComment(R"DOC(
......
...@@ -38,7 +38,7 @@ class RefByTrainerIdKernel : public framework::OpKernel<T> { ...@@ -38,7 +38,7 @@ class RefByTrainerIdKernel : public framework::OpKernel<T> {
} else { } else {
trainer_id = *trainer_id_data; trainer_id = *trainer_id_data;
} }
PADDLE_ENFORCE_LT(trainer_id, in_list.size()); PADDLE_ENFORCE_LT((size_t)trainer_id, in_list.size());
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
out->ShareDataWith(*(in_list[trainer_id])); out->ShareDataWith(*(in_list[trainer_id]));
} }
......
...@@ -122,7 +122,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -122,7 +122,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor), " "(Tensor), "
"Argmaxes corresponding to indices in X used " "Argmaxes corresponding to indices in X used "
"for gradient computation. Only output " "for gradient computation. Only output "
"if arg “is_test” is false.") "if arg \"is_test\" is false.")
.AsIntermediate(); .AsIntermediate();
AddAttr<float>("spatial_scale", AddAttr<float>("spatial_scale",
"(float, default 1.0), " "(float, default 1.0), "
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/selu_op.h"
#include <string>
namespace paddle {
namespace operators {
class SeluOp : public framework::OperatorWithKernel {
public:
SeluOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SeluOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SeluOp should not be null.");
ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.GetPlace());
}
};
class SeluOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
}
};
class SeluOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input tensor of selu operator.");
AddOutput("Out", "The output tensor of selu operator.");
AddAttr<float>("scale",
"(float) the default value is 1.0507~. For more "
"information about this value, please refer to:"
"https://arxiv.org/abs/1706.02515.")
.SetDefault(1.0507009873554804934193349852946);
AddAttr<float>("alpha",
"(float) the default value is 1.6732~. For more "
"information about this value, please refer to:"
"https://arxiv.org/abs/1706.02515.")
.SetDefault(1.6732632423543772848170429916717);
AddComment(R"DOC(
Selu Operator.
The equation is:
$$
f(x) = \lambda *
\begin{cases}
\quad \quad x, \quad \quad \quad \text{if} \ x > 0 \\
\alpha * e^x - \alpha, \qquad \text{if} \ x \leq 0
\end{cases}
$$
The input `X` can carry the LoD (Level of Details) information,
or not. And the output shares the LoD information with input `X`.
)DOC");
}
};
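A quick numeric check of the defaults above (scale ≈ 1.0507, alpha ≈ 1.6733): selu(1.0) = 1.0507 * 1.0 ≈ 1.0507, while selu(-1.0) = 1.0507 * 1.6733 * (e^{-1} - 1) ≈ 1.0507 * 1.6733 * (-0.6321) ≈ -1.1113.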
class SeluGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("selu_grad");
grad_op->SetInput("Out", Output("Out"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(this->Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class SeluGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null");
auto x_grad_name = framework::GradVarName("X");
ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("Out"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::GetDataTypeOfVar(ctx.InputVar("Out")), ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType,
ops::SeluGradMaker);
REGISTER_OPERATOR(selu_grad, ops::SeluGradOp);
REGISTER_OP_CPU_KERNEL(
selu, ops::SeluKernel<paddle::platform::CPUDeviceContext, float>,
ops::SeluKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
selu_grad, ops::SeluGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SeluGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/selu_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
selu, ops::SeluKernel<paddle::platform::CUDADeviceContext, float>,
ops::SeluKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
selu_grad, ops::SeluGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SeluGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
static HOSTDEVICE float real_exp(float x) { return expf(x); }
static HOSTDEVICE double real_exp(double x) { return exp(x); }
template <typename T>
struct SeluFunctor {
SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr)
: x_data_ptr_(x_data_ptr),
alpha_(alpha),
scale_(scale),
y_data_ptr_(y_data_ptr) {}
HOSTDEVICE void operator()(size_t idx) const {
T x_ele = x_data_ptr_[idx];
if (x_ele <= 0) {
x_ele = alpha_ * real_exp(x_ele) - alpha_;
}
y_data_ptr_[idx] = scale_ * x_ele;
}
const T* x_data_ptr_;
const float alpha_;
const float scale_;
T* y_data_ptr_;
};
template <typename T>
struct SeluGradFunctor {
SeluGradFunctor(const T* y_data_ptr, const T* dy_data_ptr, float alpha,
float scale, T* dx_data_ptr)
: y_data_ptr_(y_data_ptr),
dy_data_ptr_(dy_data_ptr),
alpha_(alpha),
scale_(scale),
la_(alpha * scale),
dx_data_ptr_(dx_data_ptr) {}
HOSTDEVICE void operator()(size_t idx) const {
T y_ele = y_data_ptr_[idx];
T dy_ele = dy_data_ptr_[idx];
float tmp = scale_;
if (y_ele <= 0) {
tmp = y_ele + la_;
}
dx_data_ptr_[idx] = dy_ele * tmp;
}
const T* y_data_ptr_;
const T* dy_data_ptr_;
const float alpha_;
const float scale_;
const float la_;
T* dx_data_ptr_;
};
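The la_ = alpha * scale member above comes from writing the SELU derivative in terms of the forward output y:
$$
\frac{\partial y}{\partial x} =
\begin{cases}
\lambda, & x > 0 \\
\lambda \alpha e^{x} = y + \lambda\alpha, & x \le 0
\end{cases}
$$
so the functor multiplies dy by scale_ when y > 0 and by y + la_ otherwise.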
template <typename DeviceContext, typename T>
class SeluKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using Tensor = framework::Tensor;
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
float alpha = context.Attr<float>("alpha");
float scale = context.Attr<float>("scale");
auto out_ptr = out->mutable_data<T>(context.GetPlace());
SeluFunctor<T> functor(x->data<T>(), alpha, scale, out_ptr);
auto& dev_ctx = context.template device_context<DeviceContext>();
size_t limit = static_cast<size_t>(x->numel());
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
for_range(functor);
}
};
template <typename DeviceContext, typename T>
class SeluGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using Tensor = framework::Tensor;
auto* out = context.Input<Tensor>("Out");
auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
float alpha = context.Attr<float>("alpha");
float scale = context.Attr<float>("scale");
auto dx_ptr = dx->mutable_data<T>(context.GetPlace());
SeluGradFunctor<T> functor(out->data<T>(), dout->data<T>(), alpha, scale,
dx_ptr);
auto& dev_ctx = context.template device_context<DeviceContext>();
size_t limit = static_cast<size_t>(out->numel());
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
for_range(functor);
}
};
} // namespace operators
} // namespace paddle
...@@ -47,7 +47,10 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -47,7 +47,10 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor<int>) This tensor is used for the sequence max-pooling " "(Tensor<int>) This tensor is used for the sequence max-pooling "
"to record the max indexes.") "to record the max indexes.")
.AsIntermediate(); .AsIntermediate();
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"pooltype", "pooltype",
"(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.") "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
......
...@@ -96,20 +96,21 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -96,20 +96,21 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("is_test", AddAttr<bool>("is_test",
"Disable epsilon adding to softmax results. Used by MKLDNN.") "(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Softmax Operator. Softmax Operator.
The input of the softmax operator is a tensor of any rank. The output tensor The input of the softmax operator is a tensor of any rank. The output tensor
has the same shape as the input. has the same shape as the input.
The input tensor will first be logically flattened to a 2-D matrix. The matrix's The input tensor will first be logically flattened to a 2-D matrix. The matrix's
second dimension(row length) is as same as the last dimension of the input second dimension(row length) is as same as the last dimension of the input
tensor, and the first dimension(column length) is the product of all other tensor, and the first dimension(column length) is the product of all other
dimensions of the input tensor. For each row of the matrix, the softmax operator dimensions of the input tensor. For each row of the matrix, the softmax operator
squashes the K-dimensional(K is the width of the matrix, which is also the size squashes the K-dimensional(K is the width of the matrix, which is also the size
of the input tensor's last dimension) vector of arbitrary real values to a of the input tensor's last dimension) vector of arbitrary real values to a
K-dimensional vector of real values in the range [0, 1] that add up to 1. K-dimensional vector of real values in the range [0, 1] that add up to 1.
It computes the exponential of the given dimension and the sum of exponential It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input. values of all the other dimensions in the K-dimensional vector input.
......
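For instance, an input of shape [2, 3, 4] is flattened to a [2 * 3, 4] = [6, 4] matrix, softmax is applied to each of the 6 rows independently, and the output keeps the original [2, 3, 4] shape.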
...@@ -35,8 +35,13 @@ class SoftmaxKernel : public framework::OpKernel<T> { ...@@ -35,8 +35,13 @@ class SoftmaxKernel : public framework::OpKernel<T> {
Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
math::SoftmaxFunctor<DeviceContext, T>()( #ifdef ON_INFER
math::SoftmaxFunctor<DeviceContext, T, true>()(
context.template device_context<DeviceContext>(), &X_2d, &Out_2d); context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
#else
math::SoftmaxFunctor<DeviceContext, T, false>()(
context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
#endif
} }
}; };
......
...@@ -42,8 +42,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> { ...@@ -42,8 +42,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
auto& dev_ctx = auto& dev_ctx =
context.template device_context<platform::CPUDeviceContext>(); context.template device_context<platform::CPUDeviceContext>();
math::SoftmaxFunctor<platform::CPUDeviceContext, T>()(dev_ctx, logits, math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()(
softmax); dev_ctx, logits, softmax);
math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()( math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"), dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"),
context.Attr<int>("ignore_index")); context.Attr<int>("ignore_index"));
......
...@@ -64,7 +64,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> { ...@@ -64,7 +64,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
out_ids.resize(outs.size()); out_ids.resize(outs.size());
// split id by their shard_num. // split id by their shard_num.
for (int i = 0; i < all_ids.size(); ++i) { for (size_t i = 0; i < all_ids.size(); ++i) {
T id = all_ids[i]; T id = all_ids[i];
size_t shard_id = static_cast<size_t>(id) % shard_num; size_t shard_id = static_cast<size_t>(id) % shard_num;
out_ids[shard_id].push_back(id); out_ids[shard_id].push_back(id);
......
...@@ -57,8 +57,8 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -57,8 +57,8 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is: Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is:
$(N, C_{out}, H_{out}, W_{out})$, where $(N, C_{out}, H_{out}, W_{out})$, where
$$ $$
H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ H_{out} = (H_{in}-1) * strides[0] - 2 * paddings[0] + ksize[0] \\
W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] W_{out} = (W_{in}-1) * strides[1] - 2 * paddings[1] + ksize[1]
$$ $$
Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf
)DOC"); )DOC");
......
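Plugging numbers into the formula above: with H_in = 4, strides[0] = 2, paddings[0] = 0 and ksize[0] = 2, H_out = (4 - 1) * 2 - 2 * 0 + 2 = 8.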
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/warpctc_op.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
#if CUDNN_VERSION >= 7001
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedCTCLossDescriptor = platform::ScopedCTCLossDescriptor;
using DataLayout = platform::DataLayout;
template <typename DeviceContext, typename T>
class CudnnCTCKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// =====================Copied code from warpctc===========================
auto* logits = ctx.Input<LoDTensor>("Logits");
auto* label = ctx.Input<LoDTensor>("Label");
auto* warpctc_grad = ctx.Output<LoDTensor>("WarpCTCGrad");
auto* loss = ctx.Output<LoDTensor>("Loss");
const size_t level = 0;
auto logits_lod = framework::ToAbsOffset(logits->lod());
auto logits_dims = logits->dims();
PADDLE_ENFORCE_EQ(logits_dims[0],
static_cast<int64_t>(logits_lod[level].back()),
"The first dimension of Input(Logits) should be equal to "
"the sum of all sequences' lengths.");
auto label_lod = framework::ToAbsOffset(label->lod());
auto label_dims = label->dims();
PADDLE_ENFORCE_EQ(
label_dims[0], label->numel(),
"The width of each timestep in Input(Label) should be 1.");
const size_t num_sequences = logits_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1,
"The number of sequences of Input(Logits) should be "
"equal to that of Input(Label).");
PADDLE_ENFORCE_LE(num_sequences, 256,
"The labelLengths must less than 256 for cudnn call.");
const size_t sequence_width = logits->numel() / logits_dims[0];
auto loss_dims =
framework::make_ddim({static_cast<int64_t>(num_sequences), 1});
// NOTE: cudnn CTC takes softmax-ed probabilities as input, so compute softmax first, then do the padding
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
LoDTensor softmax_logits;
softmax_logits.mutable_data<T>(logits->dims(), ctx.GetPlace());
softmax_logits.set_lod(logits_lod);
int rank = logits->dims().size();
Tensor in_2d = framework::ReshapeToMatrix(*logits, rank - 1);
Tensor out_2d = framework::ReshapeToMatrix(softmax_logits, rank - 1);
math::SoftmaxFunctor<DeviceContext, T, false>()(dev_ctx, &in_2d, &out_2d);
// ctc needs sequences data stored in transposed padding format
// logits and grad using padding data of layout 'TNC'
// T: max_sequence_length
// N: batch_size (num_sequences)
// C: width
LoDTensor warpctc_logits;
const size_t max_sequence_length =
math::MaximumSequenceLength(logits_lod[level]);
auto warpctc_logits_dims =
framework::make_ddim({static_cast<int64_t>(max_sequence_length),
static_cast<int64_t>(num_sequences),
static_cast<int64_t>(sequence_width)});
warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
LoDTensor cpu_pad_value;
T* pad_value_data =
cpu_pad_value.mutable_data<T>({1}, platform::CPUPlace());
*pad_value_data = static_cast<T>(0);
LoDTensor pad_value;
if (platform::is_cpu_place(ctx.GetPlace())) {
pad_value = cpu_pad_value;
} else {
TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value);
}
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), softmax_logits,
&warpctc_logits, pad_value, -1, 0, false /* norm_by_times */,
math::kLengthBatchWidth);
const T* warpctc_logits_data = warpctc_logits.data<T>();
std::vector<int> warpctc_label_lengths(num_sequences);
std::vector<int> warpctc_logits_lengths(num_sequences);
for (size_t i = 0; i < num_sequences; ++i) {
warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i];
warpctc_logits_lengths[i] =
logits_lod[level][i + 1] - logits_lod[level][i];
}
T* warpctc_grad_data =
warpctc_grad->mutable_data<T>(warpctc_logits.dims(), ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), warpctc_grad,
static_cast<T>(0));
Tensor warpctc_label;
TensorCopySync(*label, platform::CPUPlace(), &warpctc_label);
const int* warpctc_label_data = warpctc_label.data<int>();
// ========================================================================
ScopedTensorDescriptor logits_desc;
ScopedTensorDescriptor grad_desc;
ScopedCTCLossDescriptor ctcloss_desc;
// the layout here has no effect.
DataLayout layout = DataLayout::kNCHW;
auto cu_logits_desc = logits_desc.descriptor<T>(
layout, framework::vectorize2int(warpctc_logits.dims()));
auto cu_grad_desc = grad_desc.descriptor<T>(
layout, framework::vectorize2int(warpctc_grad->dims()));
auto cu_ctcloss_desc = ctcloss_desc.descriptor<T>();
auto handle = dev_ctx.cudnn_handle();
size_t workspace_size;
CUDNN_ENFORCE(platform::dynload::cudnnGetCTCLossWorkspaceSize(
handle, cu_logits_desc, cu_grad_desc, warpctc_label_data,
warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size));
T* loss_data = loss->mutable_data<T>(loss_dims, ctx.GetPlace());
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss(
handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data,
warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
loss_data, cu_grad_desc, warpctc_grad_data,
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace,
workspace_size));
};
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
};
template <typename DeviceContext, typename T>
class CudnnCTCGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
logits_grad->mutable_data<T>(ctx.GetPlace());
bool norm_by_times = ctx.Attr<bool>("norm_by_times");
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *warpctc_grad,
logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
const T* loss_grad_data = loss_grad->data<T>();
math::ScaleLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), loss_grad_data,
logits_grad);
}
};
#endif
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#if CUDNN_VERSION >= 7001
REGISTER_OP_KERNEL(
warpctc, CUDNN, plat::CUDAPlace,
ops::CudnnCTCKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_KERNEL(
warpctc_grad, CUDNN, plat::CUDAPlace,
ops::CudnnCTCGradKernel<paddle::platform::CUDADeviceContext, float>);
#endif
...@@ -14,6 +14,10 @@ limitations under the License. */ ...@@ -14,6 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/warpctc_op.h" #include "paddle/fluid/operators/warpctc_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -45,9 +49,16 @@ class WarpCTCOp : public framework::OperatorWithKernel { ...@@ -45,9 +49,16 @@ class WarpCTCOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Logits")->type()), framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
ctx.device_context()); ctx.device_context(), layout_, library_);
} }
}; };
...@@ -86,6 +97,10 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -86,6 +97,10 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
"normalize the gradients by the number of time-step, " "normalize the gradients by the number of time-step, "
"which is also the sequence's length.") "which is also the sequence's length.")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("use_cudnn",
"(bool, default: false), whether to "
"use cudnn kernel.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
An operator integrating the open-source An operator integrating the open-source
[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in [warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in
......
...@@ -92,7 +92,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -92,7 +92,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
"variables generated in the i'th step."); "variables generated in the i'th step.");
AddAttr<framework::BlockDesc *>(kStepBlock, AddAttr<framework::BlockDesc *>(kStepBlock,
"The step block inside WhileOp"); "The step block inside WhileOp");
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
)DOC"); )DOC");
} }
......
...@@ -380,5 +380,28 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { ...@@ -380,5 +380,28 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
return use_cudnn; return use_cudnn;
} }
#if CUDNN_VERSION >= 7001
class ScopedCTCLossDescriptor {
public:
ScopedCTCLossDescriptor() {
PADDLE_ENFORCE(dynload::cudnnCreateCTCLossDescriptor(&desc_));
}
~ScopedCTCLossDescriptor() {
PADDLE_ENFORCE(dynload::cudnnDestroyCTCLossDescriptor(desc_));
}
template <typename T>
inline cudnnCTCLossDescriptor_t descriptor() {
PADDLE_ENFORCE(
dynload::cudnnSetCTCLossDescriptor(desc_, CudnnDataType<T>::type));
return desc_;
}
private:
cudnnCTCLossDescriptor_t desc_;
DISABLE_COPY_AND_ASSIGN(ScopedCTCLossDescriptor);
};
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -154,7 +154,13 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -154,7 +154,13 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount); \ __macro(cudnnSetConvolutionGroupCount); \
__macro(cudnnSetConvolutionMathType); __macro(cudnnSetConvolutionMathType); \
__macro(cudnnCreateCTCLossDescriptor); \
__macro(cudnnDestroyCTCLossDescriptor); \
__macro(cudnnGetCTCLossDescriptor); \
__macro(cudnnSetCTCLossDescriptor); \
__macro(cudnnGetCTCLossWorkspaceSize); \
__macro(cudnnCTCLoss);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
......
...@@ -112,6 +112,14 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -112,6 +112,14 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
} }
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
platform::DeviceContextPool::Init(places); platform::DeviceContextPool::Init(places);
// windows has no support for openblas multi-thread
#ifdef _WIN32
if (FLAGS_paddle_num_threads > 1) {
FLAGS_paddle_num_threads = 1;
}
#endif
#ifndef PADDLE_WITH_MKLDNN #ifndef PADDLE_WITH_MKLDNN
platform::SetNumThreads(FLAGS_paddle_num_threads); platform::SetNumThreads(FLAGS_paddle_num_threads);
#endif #endif
...@@ -167,7 +175,9 @@ void InitGLOG(const std::string &prog_name) { ...@@ -167,7 +175,9 @@ void InitGLOG(const std::string &prog_name) {
// glog will not hold the ARGV[0] inside. // glog will not hold the ARGV[0] inside.
// Use strdup to alloc a new string. // Use strdup to alloc a new string.
google::InitGoogleLogging(strdup(prog_name.c_str())); google::InitGoogleLogging(strdup(prog_name.c_str()));
#ifndef _WIN32
google::InstallFailureSignalHandler(); google::InstallFailureSignalHandler();
#endif
} }
} // namespace framework } // namespace framework
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef _WIN32
#pragma once #pragma once
#include <stdio.h> #include <stdio.h>
...@@ -149,3 +150,4 @@ struct NCCLContextMap { ...@@ -149,3 +150,4 @@ struct NCCLContextMap {
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
#endif
...@@ -24,21 +24,16 @@ ...@@ -24,21 +24,16 @@
#include "glog/logging.h" #include "glog/logging.h"
#if !defined(_WIN32) #if !defined(_WIN32)
#define UNUSED __attribute__((unused))
#include <dlfcn.h> // dladdr #include <dlfcn.h> // dladdr
#include <execinfo.h> // backtrace #include <execinfo.h> // backtrace
#include <sys/stat.h> #include <sys/stat.h>
#include <algorithm> // std::accumulate #include <algorithm> // std::accumulate
#else #else
#include <io.h> // _popen, _pclose #include <io.h> // _popen, _pclose
#include <stdio.h>
#include <windows.h> #include <windows.h>
#if defined(_WIN32)
#include <numeric> // std::accumulate in msvc #include <numeric> // std::accumulate in msvc
#endif #ifndef S_ISDIR // windows port for sys/stat.h
// windows version of __attribute__((unused))
#define UNUSED __pragma(warning(suppress : 4100))
#ifndef S_ISDIR // windows port for sys/stat.h
#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
#endif // S_ISDIR #endif // S_ISDIR
......
...@@ -42,3 +42,11 @@ limitations under the License. */ ...@@ -42,3 +42,11 @@ limitations under the License. */
#include <boost/mpl/comparison.hpp> #include <boost/mpl/comparison.hpp>
#include <boost/mpl/less_equal.hpp> #include <boost/mpl/less_equal.hpp>
#include <boost/variant.hpp> #include <boost/variant.hpp>
// some platform-independent definitions
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
if(NOT WIN32) if(NOT WIN32)
list(APPEND PYBIND_DEPS parallel_executor profiler) list(APPEND PYBIND_DEPS parallel_executor profiler)
list(APPEND PYBIND_SRCS recordio.cc) list(APPEND PYBIND_SRCS recordio.cc)
endif() endif(NOT WIN32)
if(WITH_PYTHON) if(WITH_PYTHON)
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED hip_library(paddle_pybind SHARED
...@@ -21,5 +21,13 @@ if(WITH_PYTHON) ...@@ -21,5 +21,13 @@ if(WITH_PYTHON)
endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(NOT APPLE AND NOT ANDROID AND NOT WIN32)
endif(WITH_AMD_GPU) endif(WITH_AMD_GPU)
if(WIN32)
if(WITH_GPU AND NOT WITH_DSO)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
target_link_libraries(paddle_pybind ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_pybind shlwapi)
endif(WIN32)
cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
endif(WITH_PYTHON) endif(WITH_PYTHON)
...@@ -21,6 +21,13 @@ limitations under the License. */ ...@@ -21,6 +21,13 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#if defined(_WIN32)
#define NOMINMAX
#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc: windows.h macros conflict with glog's abbreviated severities
#define GOOGLE_GLOG_DLL_DECL
#include <Windows.h>
#endif
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
...@@ -29,7 +36,9 @@ limitations under the License. */ ...@@ -29,7 +36,9 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#ifndef _WIN32
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#endif
#include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
...@@ -50,7 +59,9 @@ limitations under the License. */ ...@@ -50,7 +59,9 @@ limitations under the License. */
#include "paddle/fluid/string/to_string.h" #include "paddle/fluid/string/to_string.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#ifndef _WIN32
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#include "paddle/fluid/platform/cuda_profiler.h" #include "paddle/fluid/platform/cuda_profiler.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#endif #endif
...@@ -340,22 +351,25 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -340,22 +351,25 @@ All parameter, weight, gradient are variables in Paddle.
.def("get_lod_tensor_array", .def("get_lod_tensor_array",
[](Variable &self) { return self.GetMutable<LoDTensorArray>(); }, [](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
py::return_value_policy::reference) py::return_value_policy::reference)
#ifdef PADDLE_WITH_CUDA #if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
.def("get_communicator", .def("get_communicator",
[](Variable &self) -> platform::Communicator * { [](Variable &self) -> platform::Communicator * {
return self.GetMutable<platform::Communicator>(); return self.GetMutable<platform::Communicator>();
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
#endif
.def("get_reader", .def("get_reader",
[](Variable &self) -> framework::ReaderHolder * { [](Variable &self) -> framework::ReaderHolder * {
PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>()); PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>());
return self.GetMutable<framework::ReaderHolder>(); return self.GetMutable<framework::ReaderHolder>();
}, },
py::return_value_policy::reference); py::return_value_policy::reference)
#endif
;
#if !defined(_WIN32)
py::class_<framework::ReaderHolder>(m, "Reader", "") py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("reset", &framework::ReaderHolder::ResetAll); .def("reset", &framework::ReaderHolder::ResetAll);
#endif
using LoDTensorBlockingQueue = using LoDTensorBlockingQueue =
::paddle::operators::reader::LoDTensorBlockingQueue; ::paddle::operators::reader::LoDTensorBlockingQueue;
...@@ -480,7 +494,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -480,7 +494,7 @@ All parameter, weight, gradient are variables in Paddle.
#endif #endif
});; });;
// clang-format on // clang-format on
#ifdef PADDLE_WITH_CUDA #if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif #endif
py::class_<platform::CUDAPlace>(m, "CUDAPlace") py::class_<platform::CUDAPlace>(m, "CUDAPlace")
...@@ -617,11 +631,14 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -617,11 +631,14 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
m.def("get_cuda_device_count", platform::GetCUDADeviceCount); m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
#ifndef _WIN32
m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_init", platform::CudaProfilerInit);
m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_start", platform::CudaProfilerStart);
m.def("nvprof_stop", platform::CudaProfilerStop); m.def("nvprof_stop", platform::CudaProfilerStop);
#endif #endif
#endif
#ifndef _WIN32
py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic()) py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
.value("kDisabled", platform::ProfilerState::kDisabled) .value("kDisabled", platform::ProfilerState::kDisabled)
.value("kCPU", platform::ProfilerState::kCPU) .value("kCPU", platform::ProfilerState::kCPU)
...@@ -642,6 +659,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -642,6 +659,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("disable_profiler", platform::DisableProfiler); m.def("disable_profiler", platform::DisableProfiler);
m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("is_profiler_enabled", platform::IsProfileEnabled);
m.def("reset_profiler", platform::ResetProfiler); m.def("reset_profiler", platform::ResetProfiler);
#endif
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass"); py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init()) pass.def(py::init())
...@@ -650,9 +668,9 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -650,9 +668,9 @@ All parameter, weight, gradient are variables in Paddle.
[](ir::Pass &self, const std::string &name, const std::string &attr) { [](ir::Pass &self, const std::string &name, const std::string &attr) {
self.Set<std::string>(name, new std::string(attr)); self.Set<std::string>(name, new std::string(attr));
}) })
.def("set_int", [](ir::Pass &self, const std::string &name, int val) { .def("set_int", [](ir::Pass &self, const std::string &name,
self.Set<const int>(name, new int(val)); int val) { self.Set<const int>(name, new int(val)); })
}); .def("type", &ir::Pass::Type);
py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb( py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
m, "PassBuilder"); m, "PassBuilder");
...@@ -670,6 +688,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -670,6 +688,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("remove_pass", .def("remove_pass",
[](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
#ifndef _WIN32
// -- python binds for parallel executor. // -- python binds for parallel executor.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor"); py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC( py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
...@@ -791,6 +810,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -791,6 +810,7 @@ All parameter, weight, gradient are variables in Paddle.
"reduce_strategy", "reduce_strategy",
[](const BuildStrategy &self) { return self.reduce_; }, [](const BuildStrategy &self) { return self.reduce_; },
[](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
self.reduce_ = strategy; self.reduce_ = strategy;
}, },
R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor, R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
...@@ -804,6 +824,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -804,6 +824,7 @@ All parameter, weight, gradient are variables in Paddle.
[](const BuildStrategy &self) { return self.gradient_scale_; }, [](const BuildStrategy &self) { return self.gradient_scale_; },
[](BuildStrategy &self, [](BuildStrategy &self,
BuildStrategy::GradientScaleStrategy strategy) { BuildStrategy::GradientScaleStrategy strategy) {
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
self.gradient_scale_ = strategy; self.gradient_scale_ = strategy;
}, },
R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
...@@ -815,6 +836,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -815,6 +836,7 @@ All parameter, weight, gradient are variables in Paddle.
"debug_graphviz_path", "debug_graphviz_path",
[](const BuildStrategy &self) { return self.debug_graphviz_path_; }, [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
[](BuildStrategy &self, const std::string &path) { [](BuildStrategy &self, const std::string &path) {
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
self.debug_graphviz_path_ = path; self.debug_graphviz_path_ = path;
}, },
R"DOC(The type is STR, debug_graphviz_path indicate the path that R"DOC(The type is STR, debug_graphviz_path indicate the path that
...@@ -824,6 +846,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -824,6 +846,7 @@ All parameter, weight, gradient are variables in Paddle.
"enable_data_balance", "enable_data_balance",
[](const BuildStrategy &self) { return self.enable_data_balance_; }, [](const BuildStrategy &self) { return self.enable_data_balance_; },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
self.enable_data_balance_ = b; self.enable_data_balance_ = b;
}) // FIXME(chengudo): enable_data_balance seems not important }) // FIXME(chengudo): enable_data_balance seems not important
.def_property( .def_property(
...@@ -832,6 +855,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -832,6 +855,7 @@ All parameter, weight, gradient are variables in Paddle.
return self.enable_sequential_execution_; return self.enable_sequential_execution_;
}, },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
self.enable_sequential_execution_ = b; self.enable_sequential_execution_ = b;
}, },
R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False.)DOC") R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False.)DOC")
...@@ -841,6 +865,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -841,6 +865,7 @@ All parameter, weight, gradient are variables in Paddle.
return self.remove_unnecessary_lock_; return self.remove_unnecessary_lock_;
}, },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
self.remove_unnecessary_lock_ = b; self.remove_unnecessary_lock_ = b;
}, },
R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC")
...@@ -850,15 +875,19 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -850,15 +875,19 @@ All parameter, weight, gradient are variables in Paddle.
return self.fuse_elewise_add_act_ops_; return self.fuse_elewise_add_act_ops_;
}, },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
self.fuse_elewise_add_act_ops_ = b; self.fuse_elewise_add_act_ops_ = b;
}, },
R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
to fuse elementwise_add_op and activation_op, to fuse elementwise_add_op and activation_op,
it may make the execution faster. Default False)DOC") it may make the execution faster. Default False)DOC")
.def("_create_passes_from_strategy", .def("_finalize_strategy_and_create_passes",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> { [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy(); return self.CreatePassesFromStrategy(true);
}); },
R"DOC(Allow user to customized passes. Normally model-specific
optimization passes should be defined in this way. BuildStrategy
cannot be updated after being finalized.)DOC");
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &, const std::unordered_set<std::string> &,
...@@ -887,6 +916,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -887,6 +916,7 @@ All parameter, weight, gradient are variables in Paddle.
}); });
BindRecordIOWriter(&m); BindRecordIOWriter(&m);
#endif
return m.ptr(); return m.ptr();
} }
} // namespace pybind } // namespace pybind
......
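For reference, a minimal Python sketch of the renamed pass-builder hook, mirroring the updated test_pass_builder and test_dist_base changes further down in this commit; the graph_viz_path value is illustrative.

import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
build_strategy.fuse_elewise_add_act_ops = True  # must happen before finalization

# Finalizes the strategy and returns a PassBuilder; any later property write
# now trips the new PADDLE_ENFORCE(!self.IsFinalized(), ...) check.
pass_builder = build_strategy._finalize_strategy_and_create_passes()

viz_pass = pass_builder.append_pass("graph_viz_pass")
viz_pass.set_str("graph_viz_path", "/tmp/train.dot")  # path is illustrative
print([p.type() for p in pass_builder.all_passes()])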
...@@ -156,6 +156,8 @@ function cmake_gen() { ...@@ -156,6 +156,8 @@ function cmake_gen() {
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
-DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}
-DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}
-DPY_VERSION=${PY_VERSION:-2.7} -DPY_VERSION=${PY_VERSION:-2.7}
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
======================================== ========================================
...@@ -188,6 +190,8 @@ EOF ...@@ -188,6 +190,8 @@ EOF
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
-DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\
-DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\
-DPY_VERSION=${PY_VERSION:-2.7} \ -DPY_VERSION=${PY_VERSION:-2.7} \
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
...@@ -777,6 +781,17 @@ function main() { ...@@ -777,6 +781,17 @@ function main() {
test_fluid_lib test_fluid_lib
assert_api_spec_approvals assert_api_spec_approvals
;; ;;
assert_api)
assert_api_not_changed ${PYTHON_ABI:-""}
;;
test_inference)
gen_capi_package
gen_fluid_lib
test_fluid_lib
;;
assert_api_approvals)
assert_api_spec_approvals
;;
maccheck) maccheck)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
build_mac build_mac
......
...@@ -45,23 +45,42 @@ endif() ...@@ -45,23 +45,42 @@ endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py) ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
IF(WIN32)
set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) # Python would use the .pyd by default under Windows series platform
add_custom_command(OUTPUT ${FLUID_CORE} set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/)
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE} get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY)
DEPENDS paddle_pybind) set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd)
add_custom_command(OUTPUT ${FLUID_CORE}
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR}
DEPENDS paddle_pybind)
ELSE()
set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so)
add_custom_command(OUTPUT ${FLUID_CORE}
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
DEPENDS paddle_pybind)
ENDIF()
add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE})
IF(WIN32)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND touch stub.cc COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/
COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib.* ${PADDLE_PYTHON_BUILD_DIR}/lib-python ELSE(WIN32)
DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND touch stub.cc
COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
ENDIF()
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
if(NOT WITH_FLUID_ONLY) if(NOT WITH_FLUID_ONLY)
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import os
# import all class inside framework into fluid module # import all class inside framework into fluid module
from . import framework from . import framework
from .framework import * from .framework import *
...@@ -111,12 +112,16 @@ def __bootstrap__(): ...@@ -111,12 +112,16 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads) os.environ['OMP_NUM_THREADS'] = str(num_threads)
read_env_flags = [ read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb', 'dist_threadpool_size', 'eager_delete_tensor_gb',
'reader_queue_speed_test_mode' 'reader_queue_speed_test_mode'
] ]
if os.name != 'nt':
read_env_flags.append('warpctc_dir')
read_env_flags.append('cpu_deterministic')
if core.is_compiled_with_dist(): if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_deadline')
read_env_flags.append('rpc_server_profile_path') read_env_flags.append('rpc_server_profile_path')
......
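For context, the names in read_env_flags are picked up from FLAGS_-prefixed environment variables when paddle.fluid is first imported; a hedged sketch of a user script (flag values are illustrative):

import os

# Must be set before the first `import paddle.fluid`, because __bootstrap__
# reads these GFlags only once at import time.
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
os.environ['FLAGS_initial_cpu_memory_in_mb'] = '500'

import paddle.fluid as fluid  # the flags above are now in effect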
...@@ -15,13 +15,15 @@ ...@@ -15,13 +15,15 @@
from __future__ import print_function from __future__ import print_function
import contextlib import contextlib
import os
from .. import core from .. import core
from .. import executor from .. import executor
from .. import framework from .. import framework
from .. import io from .. import io
from .. import parallel_executor if os.name != 'nt':
from .. import parallel_executor
from .. import unique_name from .. import unique_name
from .trainer import check_and_get_place from .trainer import check_and_get_place
......
...@@ -28,7 +28,8 @@ from .. import framework ...@@ -28,7 +28,8 @@ from .. import framework
from .. import io from .. import io
# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
from .. import optimizer as opt_module from .. import optimizer as opt_module
from .. import parallel_executor if os.name != 'nt':
from .. import parallel_executor
from ..transpiler import distribute_transpiler from ..transpiler import distribute_transpiler
__all__ = [ __all__ = [
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from __future__ import print_function from __future__ import print_function
import contextlib import contextlib
import multiprocessing import multiprocessing
import os
import six import six
import threading import threading
...@@ -346,70 +347,72 @@ def _copy_reader_create_op_(block, op): ...@@ -346,70 +347,72 @@ def _copy_reader_create_op_(block, op):
return new_op return new_op
@templatedoc(op_type='create_recordio_file_reader') if os.name != 'nt':
def open_recordio_file(filename,
shapes, @templatedoc(op_type='create_recordio_file_reader')
lod_levels, def open_recordio_file(filename,
dtypes, shapes,
pass_num=1, lod_levels,
for_parallel=True): dtypes,
""" pass_num=1,
${comment} for_parallel=True):
"""
Args: ${comment}
filename(${filename_type}): ${filename_comment}.
shapes(list): List of tuples which declaring data shapes. Args:
lod_levels(${lod_levels_type}): ${lod_levels_comment}. filename(${filename_type}): ${filename_comment}.
dtypes(list): List of strs which declaring data type. shapes(list): List of tuples which declaring data shapes.
pass_num(int): Number of passes to run. lod_levels(${lod_levels_type}): ${lod_levels_comment}.
for_parallel(Bool): Set it as True if you are going to run dtypes(list): List of strs which declaring data type.
subsequent operators in parallel. pass_num(int): Number of passes to run.
for_parallel(Bool): Set it as True if you are going to run
Returns: subsequent operators in parallel.
${out_comment}.
Returns:
Examples: ${out_comment}.
>>> import paddle.fluid as fluid Examples:
>>> reader = fluid.layers.io.open_recordio_file(
>>> filename='./data.recordio', >>> import paddle.fluid as fluid
>>> shapes=[(3,224,224), (1)], >>> reader = fluid.layers.io.open_recordio_file(
>>> lod_levels=[0, 0], >>> filename='./data.recordio',
>>> dtypes=['float32', 'int64']) >>> shapes=[(3,224,224), (1)],
>>> # Via the reader, we can use 'read_file' layer to get data: >>> lod_levels=[0, 0],
>>> image, label = fluid.layers.io.read_file(reader) >>> dtypes=['float32', 'int64'])
""" >>> # Via the reader, we can use 'read_file' layer to get data:
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] >>> image, label = fluid.layers.io.read_file(reader)
shape_concat = [] """
ranks = [] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
shape_concat = []
ranks = []
for shape in shapes: for shape in shapes:
shape_concat.extend(shape) shape_concat.extend(shape)
ranks.append(len(shape)) ranks.append(len(shape))
var_name = unique_name('open_recordio_file') var_name = unique_name('open_recordio_file')
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name) startup_var = startup_blk.create_var(name=var_name)
startup_blk.append_op( startup_blk.append_op(
type='create_recordio_file_reader', type='create_recordio_file_reader',
outputs={'Out': [startup_var]}, outputs={'Out': [startup_var]},
attrs={ attrs={
'shape_concat': shape_concat, 'shape_concat': shape_concat,
'lod_levels': lod_levels, 'lod_levels': lod_levels,
'filename': filename, 'filename': filename,
'ranks': ranks 'ranks': ranks
}) })
startup_var.desc.set_dtypes(dtypes) startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True startup_var.persistable = True
main_prog_var = _copy_reader_var_(default_main_program().current_block(), main_prog_var = _copy_reader_var_(
startup_var) default_main_program().current_block(), startup_var)
if pass_num > 1: if pass_num > 1:
main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
return monkey_patch_reader_methods(main_prog_var) return monkey_patch_reader_methods(main_prog_var)
def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
......
...@@ -18,6 +18,7 @@ All layers just related to the neural network. ...@@ -18,6 +18,7 @@ All layers just related to the neural network.
from __future__ import print_function from __future__ import print_function
import numpy as np import numpy as np
import os
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant from ..initializer import Normal, Constant
from ..framework import Variable, OpProtoHolder from ..framework import Variable, OpProtoHolder
...@@ -109,6 +110,7 @@ __all__ = [ ...@@ -109,6 +110,7 @@ __all__ = [
'random_crop', 'random_crop',
'mean_iou', 'mean_iou',
'relu', 'relu',
'selu',
'log', 'log',
'crop', 'crop',
'rank_loss', 'rank_loss',
...@@ -341,126 +343,128 @@ def embedding(input, ...@@ -341,126 +343,128 @@ def embedding(input,
return tmp return tmp
@templatedoc(op_type="lstm") if os.name != 'nt':
def dynamic_lstm(input,
size,
h_0=None,
c_0=None,
param_attr=None,
bias_attr=None,
use_peepholes=True,
is_reverse=False,
gate_activation='sigmoid',
cell_activation='tanh',
candidate_activation='tanh',
dtype='float32',
name=None):
"""
${comment}
Args:
input (Variable): ${input_comment}
size (int): 4 * hidden size.
h_0(Variable): The initial hidden state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size and D is the hidden size.
c_0(Variable): The initial cell state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size. `h_0` and `c_0` can be NULL but only at the same time.
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weights.
- Weights = {:math:`W_{ch}, W_{ih}, \
W_{fh}, W_{oh}`}
- The shape is (D x 4D), where D is the hidden
size.
If it is set to None or one attribute of ParamAttr,
dynamic_lstm will create ParamAttr as param_attr.
If the Initializer of the param_attr is not set, the
parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|None): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden
bias weights and peephole connections weights if
setting `use_peepholes` to `True`.
1. `use_peepholes = False` @templatedoc(op_type="lstm")
- Biases = {:math:`b_c, b_i, b_f, b_o`}. def dynamic_lstm(input,
- The shape is (1 x 4D). size,
2. `use_peepholes = True` h_0=None,
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ c_0=None,
W_{fc}, W_{oc}`}. param_attr=None,
- The shape is (1 x 7D). bias_attr=None,
use_peepholes=True,
If it is set to None or one attribute of ParamAttr, is_reverse=False,
dynamic_lstm will create ParamAttr as bias_attr. gate_activation='sigmoid',
If the Initializer of the bias_attr is not set, cell_activation='tanh',
the bias is initialized zero. Default: None. candidate_activation='tanh',
use_peepholes (bool): ${use_peepholes_comment} dtype='float32',
is_reverse (bool): ${is_reverse_comment} name=None):
gate_activation (str): ${gate_activation_comment} """
cell_activation (str): ${cell_activation_comment} ${comment}
candidate_activation (str): ${candidate_activation_comment}
dtype (str): Data type. Choices = ["float32", "float64"], default "float32". Args:
name (str|None): A name for this layer(optional). If set None, the layer input (Variable): ${input_comment}
will be named automatically. size (int): 4 * hidden size.
h_0(Variable): The initial hidden state is an optional input, default is zero.
Returns: This is a tensor with shape (N x D), where N is the
tuple: The hidden state, and cell state of LSTM. The shape of both \ batch size and D is the hidden size.
is (T x D), and lod is the same with the `input`. c_0(Variable): The initial cell state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
Examples: batch size. `h_0` and `c_0` can be NULL but only at the same time.
.. code-block:: python param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weights.
hidden_dim = 512
forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, - Weights = {:math:`W_{ch}, W_{ih}, \
bias_attr=False) W_{fh}, W_{oh}`}
forward, _ = fluid.layers.dynamic_lstm( - The shape is (D x 4D), where D is the hidden
input=forward_proj, size=hidden_dim * 4, use_peepholes=False) size.
"""
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." If it is set to None or one attribute of ParamAttr,
helper = LayerHelper('lstm', **locals()) dynamic_lstm will create ParamAttr as param_attr.
size = size // 4 If the Initializer of the param_attr is not set, the
weight = helper.create_parameter( parameter is initialized with Xavier. Default: None.
attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) bias_attr (ParamAttr|None): The bias attribute for the learnable bias
bias_size = [1, 7 * size] weights, which contains two parts, input-hidden
if not use_peepholes: bias weights and peephole connections weights if
bias_size[1] = 4 * size setting `use_peepholes` to `True`.
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) 1. `use_peepholes = False`
- Biases = {:math:`b_c, b_i, b_f, b_o`}.
- The shape is (1 x 4D).
2. `use_peepholes = True`
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}.
- The shape is (1 x 7D).
If it is set to None or one attribute of ParamAttr,
dynamic_lstm will create ParamAttr as bias_attr.
If the Initializer of the bias_attr is not set,
the bias is initialized zero. Default: None.
use_peepholes (bool): ${use_peepholes_comment}
is_reverse (bool): ${is_reverse_comment}
gate_activation (str): ${gate_activation_comment}
cell_activation (str): ${cell_activation_comment}
candidate_activation (str): ${candidate_activation_comment}
dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
tuple: The hidden state, and cell state of LSTM. The shape of both \
is (T x D), and lod is the same with the `input`.
Examples:
.. code-block:: python
hidden_dim = 512
forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
bias_attr=False)
forward, _ = fluid.layers.dynamic_lstm(
input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
"""
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
helper = LayerHelper('lstm', **locals())
size = size // 4
weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
bias_size = [1, 7 * size]
if not use_peepholes:
bias_size[1] = 4 * size
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
hidden = helper.create_variable_for_type_inference(dtype) hidden = helper.create_variable_for_type_inference(dtype)
cell = helper.create_variable_for_type_inference(dtype) cell = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype) batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
inputs = {'Input': input, 'Weight': weight, 'Bias': bias} inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
batch_size = input.shape[0] batch_size = input.shape[0]
if h_0: if h_0:
assert h_0.shape == (batch_size, size), \ assert h_0.shape == (batch_size, size), \
'The shape of h0 should be (batch_size, %d)' % size 'The shape of h0 should be (batch_size, %d)' % size
inputs['H0'] = h_0 inputs['H0'] = h_0
if c_0: if c_0:
assert c_0.shape == (batch_size, size), \ assert c_0.shape == (batch_size, size), \
'The shape of c0 should be (batch_size, %d)' % size 'The shape of c0 should be (batch_size, %d)' % size
inputs['C0'] = c_0 inputs['C0'] = c_0
helper.append_op( helper.append_op(
type='lstm', type='lstm',
inputs=inputs, inputs=inputs,
outputs={ outputs={
'Hidden': hidden, 'Hidden': hidden,
'Cell': cell, 'Cell': cell,
'BatchGate': batch_gate, 'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act 'BatchCellPreAct': batch_cell_pre_act
}, },
attrs={ attrs={
'use_peepholes': use_peepholes, 'use_peepholes': use_peepholes,
'is_reverse': is_reverse, 'is_reverse': is_reverse,
'gate_activation': gate_activation, 'gate_activation': gate_activation,
'cell_activation': cell_activation, 'cell_activation': cell_activation,
'candidate_activation': candidate_activation 'candidate_activation': candidate_activation
}) })
return hidden, cell return hidden, cell
def dynamic_lstmp(input, def dynamic_lstmp(input,
...@@ -959,39 +963,43 @@ def linear_chain_crf(input, label, param_attr=None): ...@@ -959,39 +963,43 @@ def linear_chain_crf(input, label, param_attr=None):
return log_likelihood return log_likelihood
@templatedoc() if os.name != 'nt':
def crf_decoding(input, param_attr, label=None):
"""
${comment}
Args: @templatedoc()
input(${emission_type}): ${emission_comment} def crf_decoding(input, param_attr, label=None):
"""
${comment}
param_attr(ParamAttr): The parameter attribute for training. Args:
input(${emission_type}): ${emission_comment}
label(${label_type}): ${label_comment} param_attr(ParamAttr): The parameter attribute for training.
Returns: label(${label_type}): ${label_comment}
Variable: ${viterbi_path_comment}
Examples: Returns:
.. code-block:: python Variable: ${viterbi_path_comment}
crf_decode = layers.crf_decoding( Examples:
input=hidden, param_attr=ParamAttr(name="crfw")) .. code-block:: python
"""
helper = LayerHelper('crf_decoding', **locals()) crf_decode = layers.crf_decoding(
transition = helper.get_parameter(param_attr.name) input=hidden, param_attr=ParamAttr(name="crfw"))
viterbi_path = helper.create_variable_for_type_inference( """
dtype=helper.input_dtype()) helper = LayerHelper('crf_decoding', **locals())
helper.append_op( transition = helper.get_parameter(param_attr.name)
type='crf_decoding', viterbi_path = helper.create_variable_for_type_inference(
inputs={"Emission": [input], dtype=helper.input_dtype())
helper.append_op(
type='crf_decoding',
inputs={
"Emission": [input],
"Transition": transition, "Transition": transition,
"Label": label}, "Label": label
outputs={"ViterbiPath": [viterbi_path]}) },
outputs={"ViterbiPath": [viterbi_path]})
return viterbi_path return viterbi_path
@templatedoc() @templatedoc()
...@@ -4179,7 +4187,7 @@ def ctc_greedy_decoder(input, blank, name=None): ...@@ -4179,7 +4187,7 @@ def ctc_greedy_decoder(input, blank, name=None):
return ctc_out return ctc_out
def warpctc(input, label, blank=0, norm_by_times=False): def warpctc(input, label, blank=0, norm_by_times=False, use_cudnn=False):
""" """
An operator integrating the open source Warp-CTC library An operator integrating the open source Warp-CTC library
(https://github.com/baidu-research/warp-ctc) (https://github.com/baidu-research/warp-ctc)
...@@ -4204,6 +4212,7 @@ def warpctc(input, label, blank=0, norm_by_times=False): ...@@ -4204,6 +4212,7 @@ def warpctc(input, label, blank=0, norm_by_times=False):
by the number of time-step, which is also the sequence's length. by the number of time-step, which is also the sequence's length.
There is no need to normalize the gradients if warpctc layer was There is no need to normalize the gradients if warpctc layer was
followed by a mean_op. followed by a mean_op.
use_cudnn (bool, default false): Whether to use cudnn.
Returns: Returns:
Variable: The Connectionist Temporal Classification (CTC) loss, Variable: The Connectionist Temporal Classification (CTC) loss,
...@@ -4227,8 +4236,11 @@ def warpctc(input, label, blank=0, norm_by_times=False): ...@@ -4227,8 +4236,11 @@ def warpctc(input, label, blank=0, norm_by_times=False):
'Label': [label]}, 'Label': [label]},
outputs={'WarpCTCGrad': [grad_out], outputs={'WarpCTCGrad': [grad_out],
'Loss': [loss_out]}, 'Loss': [loss_out]},
attrs={'blank': blank, attrs={
'norm_by_times': norm_by_times}) 'blank': blank,
'norm_by_times': norm_by_times,
'use_cudnn': use_cudnn
})
return loss_out return loss_out
...@@ -5538,42 +5550,48 @@ def label_smooth(label, ...@@ -5538,42 +5550,48 @@ def label_smooth(label,
return smooth_label return smooth_label
@templatedoc() if os.name != 'nt':
def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
""" @templatedoc()
${comment} def roi_pool(input,
rois,
Args: pooled_height=1,
input (Variable): ${x_comment} pooled_width=1,
rois (Variable): ROIs (Regions of Interest) to pool over. spatial_scale=1.0):
pooled_height (integer): ${pooled_height_comment} Default: 1 """
pooled_width (integer): ${pooled_width_comment} Default: 1 ${comment}
spatial_scale (float): ${spatial_scale_comment} Default: 1.0
Args:
Returns: input (Variable): ${x_comment}
Variable: ${out_comment}. rois (Variable): ROIs (Regions of Interest) to pool over.
pooled_height (integer): ${pooled_height_comment} Default: 1
Examples: pooled_width (integer): ${pooled_width_comment} Default: 1
.. code-block:: python spatial_scale (float): ${spatial_scale_comment} Default: 1.0
pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) Returns:
""" Variable: ${out_comment}.
helper = LayerHelper('roi_pool', **locals())
dtype = helper.input_dtype() Examples:
pool_out = helper.create_variable_for_type_inference(dtype) .. code-block:: python
argmaxes = helper.create_variable_for_type_inference(dtype='int32')
helper.append_op( pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
type="roi_pool", """
inputs={"X": input, helper = LayerHelper('roi_pool', **locals())
"ROIs": rois}, dtype = helper.input_dtype()
outputs={"Out": pool_out, pool_out = helper.create_variable_for_type_inference(dtype)
"Argmax": argmaxes}, argmaxes = helper.create_variable_for_type_inference(dtype='int32')
attrs={ helper.append_op(
"pooled_height": pooled_height, type="roi_pool",
"pooled_width": pooled_width, inputs={"X": input,
"spatial_scale": spatial_scale "ROIs": rois},
}) outputs={"Out": pool_out,
return pool_out "Argmax": argmaxes},
attrs={
"pooled_height": pooled_height,
"pooled_width": pooled_width,
"spatial_scale": spatial_scale
})
return pool_out
@templatedoc() @templatedoc()
...@@ -6169,6 +6187,47 @@ def relu(x, name=None): ...@@ -6169,6 +6187,47 @@ def relu(x, name=None):
return out return out
@templatedoc()
def selu(x, scale=None, alpha=None, name=None):
"""
${comment}
Args:
x (Variable): The input tensor.
scale(float, None): If the scale is not set,
the default value is 1.0507009873554804934193349852946.
For more information about this value, please refer
to: https://arxiv.org/abs/1706.02515.
alpha(float, None): If the alpha is not set,
the default value is 1.6732632423543772848170429916717.
For more information about this value, please refer
to: https://arxiv.org/abs/1706.02515.
name (str|None, default None): A name for this layer (optional). If set None,
the layer will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
Examples:
.. code-block:: python
output = fluid.layers.selu(x)
"""
helper = LayerHelper('selu', **locals())
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype)
attrs = {}
if scale is not None:
attrs["scale"] = scale
if alpha is not None:
attrs["alpha"] = alpha
helper.append_op(
type="selu", inputs={"X": x}, outputs={"Out": out}, attrs=attrs)
return out
def mean_iou(input, label, num_classes): def mean_iou(input, label, num_classes):
""" """
Mean Intersection-Over-Union is a common evaluation metric for Mean Intersection-Over-Union is a common evaluation metric for
...@@ -6822,7 +6881,7 @@ def prelu(x, mode, param_attr=None, name=None): ...@@ -6822,7 +6881,7 @@ def prelu(x, mode, param_attr=None, name=None):
alpha_shape = x.shape alpha_shape = x.shape
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
alpha = helper.create_parameter( alpha = helper.create_parameter(
attr=param_attr, attr=helper.param_attr,
shape=alpha_shape, shape=alpha_shape,
dtype='float32', dtype='float32',
is_bias=False, is_bias=False,
......
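A short, hedged usage sketch for the two layer-level changes in this file — the new selu layer and the use_cudnn flag on warpctc; shapes and names are illustrative and assume a CPU build (hence use_cudnn=False):

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[8], dtype='float32')
# selu falls back to the paper's default scale/alpha when they are left as None.
y = fluid.layers.selu(x)

logits = fluid.layers.data(name='logits', shape=[8], dtype='float32', lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int32', lod_level=1)
# warpctc now forwards use_cudnn to the operator's attributes.
loss = fluid.layers.warpctc(input=logits, label=label, blank=0, use_cudnn=False)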
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import os
from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr
from .. import core from .. import core
from ..framework import convert_np_dtype_to_dtype_ from ..framework import convert_np_dtype_to_dtype_
...@@ -99,27 +100,26 @@ Examples: ...@@ -99,27 +100,26 @@ Examples:
>>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
""" """
__all__ += ['cumsum'] if os.name != 'nt':
__all__ += ['cumsum']
_cum_sum_ = generate_layer_fn('cumsum')
_cum_sum_ = generate_layer_fn('cumsum')
def cumsum(x, axis=None, exclusive=None, reverse=None): def cumsum(x, axis=None, exclusive=None, reverse=None):
locals_var = locals().keys() locals_var = locals().keys()
kwargs = dict() kwargs = dict()
for name in locals_var: for name in locals_var:
val = locals()[name] val = locals()[name]
if val is not None: if val is not None:
kwargs[name] = val kwargs[name] = val
return _cum_sum_(**kwargs) return _cum_sum_(**kwargs)
cumsum.__doc__ = _cum_sum_.__doc__ + """
cumsum.__doc__ = _cum_sum_.__doc__ + """ Examples:
Examples:
>>> data = fluid.layers.data(name="input", shape=[32, 784])
>>> data = fluid.layers.data(name="input", shape=[32, 784]) >>> result = fluid.layers.cumsum(data, axis=0)
>>> result = fluid.layers.cumsum(data, axis=0) """
"""
__all__ += ['thresholded_relu'] __all__ += ['thresholded_relu']
......
...@@ -26,6 +26,7 @@ from multiprocessing import Process ...@@ -26,6 +26,7 @@ from multiprocessing import Process
from functools import reduce from functools import reduce
import numpy as np import numpy as np
import pickle
import unittest import unittest
import six import six
...@@ -166,7 +167,10 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): ...@@ -166,7 +167,10 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
io.save_persistables(startup_exe, model_dir, trainer_prog) io.save_persistables(startup_exe, model_dir, trainer_prog)
var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor()) var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor())
print(np.ravel(var).tolist()) if six.PY2:
print(pickle.dumps(np.ravel(var).tolist()))
else:
sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist()))
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -105,7 +105,7 @@ class TestDistRunnerBase(object): ...@@ -105,7 +105,7 @@ class TestDistRunnerBase(object):
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
if args.batch_merge_repeat > 1: if args.batch_merge_repeat > 1:
pass_builder = build_stra._create_passes_from_strategy() pass_builder = build_stra._finalize_strategy_and_create_passes()
mypass = pass_builder.insert_pass( mypass = pass_builder.insert_pass(
len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
mypass.set_int("num_repeats", args.batch_merge_repeat) mypass.set_int("num_repeats", args.batch_merge_repeat)
......
...@@ -65,14 +65,14 @@ class TestDistSaveLoadDense2x2(TestDistBase): ...@@ -65,14 +65,14 @@ class TestDistSaveLoadDense2x2(TestDistBase):
shutil.rmtree(model_dir) shutil.rmtree(model_dir)
local_np = np.array(eval(local_var[0])) local_np = np.array(local_var)
train0_np = np.array(eval(tr0_var[0])) train0_np = np.array(tr0_var)
train1_np = np.array(eval(tr1_var[0])) train1_np = np.array(tr1_var)
self.assertAlmostEqual(local_np.all(), train0_np.all(), delta=delta) self.assertAlmostEqual(local_np.all(), train0_np.all(), delta=delta)
self.assertAlmostEqual(local_np.all(), train1_np.all(), delta=delta) self.assertAlmostEqual(local_np.all(), train1_np.all(), delta=delta)
self.assertAlmostEqual(train0_np.all(), train1_np.all(), delta=delta) self.assertAlmostEqual(train0_np.all(), train1_np.all(), delta=delta)
@unittest.skip(reason="CI fail")
def test_dist(self): def test_dist(self):
need_envs = { need_envs = {
"IS_DISTRIBUTED": '0', "IS_DISTRIBUTED": '0',
......
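The save/load test above now exchanges variables between processes as pickled bytes on stdout instead of printed Python literals; a hypothetical sketch of the consuming side (the helper name and command are made up for illustration, and it assumes the child writes nothing but the pickle to stdout):

import pickle
import subprocess

import numpy as np

def read_var_from_child(cmd):
    # The trainer child writes pickle.dumps(np.ravel(var).tolist()) to stdout.
    out = subprocess.check_output(cmd)
    return np.array(pickle.loads(out))  # matches np.array(local_var) in the test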
...@@ -83,6 +83,34 @@ class TestInferShape(unittest.TestCase): ...@@ -83,6 +83,34 @@ class TestInferShape(unittest.TestCase):
mul_op_desc.infer_shape(block) mul_op_desc.infer_shape(block)
self.assertEqual(out.shape(), [x_shape[0], y_shape[1]]) self.assertEqual(out.shape(), [x_shape[0], y_shape[1]])
def test_expand_op(self):
prog = core.ProgramDesc()
self.assertIsNotNone(prog)
block = prog.block(0)
self.assertIsNotNone(block)
shape = [-1, 20]
expand_times = [3, 1]
# prepare input/output
x1 = block.var(six.b("x"))
x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
x1.set_shape(shape)
out = block.var(six.b("out"))
out.set_type(core.VarDesc.VarType.LOD_TENSOR)
# prepare the operator
sum_op_desc = block.append_op()
sum_op_desc.set_type("expand")
sum_op_desc.set_input("X", ["x"])
sum_op_desc.set_output("Out", ["out"])
sum_op_desc._set_attr('expand_times', expand_times)
sum_op_desc.check_attrs()
sum_op_desc.infer_shape(block)
self.assertEqual(out.shape(), shape)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -94,7 +94,12 @@ class TestPassBuilder(unittest.TestCase): ...@@ -94,7 +94,12 @@ class TestPassBuilder(unittest.TestCase):
def test_parallel_testing_with_new_strategy(self): def test_parallel_testing_with_new_strategy(self):
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
pass_builder = build_strategy._create_passes_from_strategy() self.assertFalse(build_strategy.fuse_elewise_add_act_ops)
build_strategy.fuse_elewise_add_act_ops = True
pass_builder = build_strategy._finalize_strategy_and_create_passes()
self.assertTrue("fuse_elewise_add_act_pass" in
[p.type() for p in pass_builder.all_passes()])
origin_len = len(pass_builder.all_passes()) origin_len = len(pass_builder.all_passes())
viz_pass = pass_builder.append_pass("graph_viz_pass") viz_pass = pass_builder.append_pass("graph_viz_pass")
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import six
from op_test import OpTest
class SeluTest(OpTest):
def setUp(self):
self.op_type = "selu"
self.x_shape = [3, 5, 5, 10]
self.dtype = np.float32
self.init_x_shape()
self.init_dtype()
alpha = 1.6732632423543772848170429916717
scale = 1.0507009873554804934193349852946
x = np.random.normal(size=self.x_shape).astype(self.dtype)
# Since selu is not differentiable at zero, avoid randomizing values
# too close to zero.
x[np.abs(x) < 0.005] = 0.02
x_flat = x.flatten()
for i in range(x_flat.size):
if x_flat[i] < 0:
x_flat[i] = alpha * np.exp(x_flat[i]) - alpha
x_flat[i] = scale * x_flat[i]
out_np = x_flat.reshape(self.x_shape)
self.inputs = {'X': x}
self.outputs = {'Out': out_np}
self.attrs = {
'alpha': alpha,
'scale': scale,
}
def init_x_shape(self):
pass
def init_dtype(self):
pass
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
if __name__ == "__main__":
unittest.main()
...@@ -183,6 +183,7 @@ class TestWarpCTCOp(OpTest): ...@@ -183,6 +183,7 @@ class TestWarpCTCOp(OpTest):
self.labels_lod = [[3, 1, 4, 4]] self.labels_lod = [[3, 1, 4, 4]]
self.blank = self.num_classes - 1 self.blank = self.num_classes - 1
self.norm_by_times = False self.norm_by_times = False
self.use_cudnn = False
def setUp(self): def setUp(self):
self.op_type = "warpctc" self.op_type = "warpctc"
...@@ -215,7 +216,11 @@ class TestWarpCTCOp(OpTest): ...@@ -215,7 +216,11 @@ class TestWarpCTCOp(OpTest):
"Label": (labels, self.labels_lod) "Label": (labels, self.labels_lod)
} }
self.outputs = {"Loss": loss} self.outputs = {"Loss": loss}
self.attrs = {"blank": self.blank, "norm_by_times": self.norm_by_times} self.attrs = {
"blank": self.blank,
"norm_by_times": self.norm_by_times,
"use_cudnn": self.use_cudnn
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
...@@ -233,6 +238,22 @@ class TestWarpCTCOpCase1(TestWarpCTCOp): ...@@ -233,6 +238,22 @@ class TestWarpCTCOpCase1(TestWarpCTCOp):
self.labels_lod = [[3, 1, 4, 4]] self.labels_lod = [[3, 1, 4, 4]]
self.blank = 0 self.blank = 0
self.norm_by_times = False self.norm_by_times = False
self.use_cudnn = False
class TestCudnnCTCOp(TestWarpCTCOp):
def config(self):
self.batch_size = 4
self.num_classes = 8
self.logits_lod = [[4, 1, 3, 3]]
self.labels_lod = [[3, 1, 4, 4]]
self.blank = 0
self.norm_by_times = False
self.use_cudnn = True
def test_check_grad(self):
self.outputs['WarpCTCGrad'] = self.gradient
self.check_grad(["Logits"], "Loss", max_relative_error=0.01)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -73,6 +73,38 @@ class InferenceTranspiler(object): ...@@ -73,6 +73,38 @@ class InferenceTranspiler(object):
program) # ResNet residual block merging program) # ResNet residual block merging
self._fuse_bn_relu_mkldnn(program) self._fuse_bn_relu_mkldnn(program)
self._is_test_pass(program)
def _is_test_pass(self, program):
'''
Transpile the program by setting is_test = True on all operators that
already expose the attribute, and inserting it for pooling and activation layers.
As a result some operators might run faster.
:param program: program to transpile
:type program: Program
'''
self.block = program.block(0)
i = 0
while i < len(self.block.ops):
current_op = self.block.ops[i]
if current_op.has_attr("is_test"):
current_op._set_attr("is_test", True)
elif current_op.type in [
"pool2d", "sigmoid", "logsigmoid", "softshrink", "exp",
"brelu", "pow", "leaky_relu", "stanh", "relu", "tanh",
"tanh_shrink", "sqrt", "abs", "ceil", "elu", "floor", "cos",
"sin", "round", "reciprocal", "hard_shrink", "hard_sigmoid",
"relu6", "soft_relu", "swish", "thresholded_relu", "log",
"square", "softplus", "softsign"
]:
current_op._set_attr("is_test", True)
i = i + 1
# TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately.
# And a better solution will be considered later.
program = program.clone()
def _depthwise_conv_mkldnn(self, program): def _depthwise_conv_mkldnn(self, program):
''' '''
Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program. Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program.
......
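A hedged sketch of how the transpiler above is typically driven; 'model_dir' is a placeholder for a previously saved inference model, and the new _is_test_pass runs as part of transpile():

import paddle.fluid as fluid

place = fluid.CPUPlace()
exe = fluid.Executor(place)
# Load a saved inference program, then rewrite it in place for inference.
[program, feed_names, fetch_targets] = fluid.io.load_inference_model('model_dir', exe)
t = fluid.transpiler.InferenceTranspiler()
t.transpile(program, place)  # sets is_test=True on supported operators, among other rewrites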
...@@ -9,7 +9,7 @@ class BinaryDistribution(Distribution): ...@@ -9,7 +9,7 @@ class BinaryDistribution(Distribution):
RC = 0 RC = 0
ext_name = '.dll' if os.name == 'nt' else '.so'
def git_commit(): def git_commit():
try: try:
...@@ -136,10 +136,13 @@ if '${WITH_FLUID_ONLY}'== 'OFF': ...@@ -136,10 +136,13 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
'${PADDLE_BINARY_DIR}/paddle/legacy/pserver/paddle_pserver_main', '${PADDLE_BINARY_DIR}/paddle/legacy/pserver/paddle_pserver_main',
'${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
package_data={'paddle.fluid': ['core.so']} package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]}
if os.name == 'nt':
package_data['paddle.fluid'] += ['openblas' + ext_name]
if '${WITH_FLUID_ONLY}'== 'OFF': if '${WITH_FLUID_ONLY}'== 'OFF':
package_data['paddle.v2.master']=['libpaddle_master.so'] package_data['paddle.v2.master']=['libpaddle_master' + ext_name]
package_data['py_paddle']=['*.py','_swig_paddle.so'] package_data['py_paddle']=['*.py','_swig_paddle' + ext_name]
package_dir={ package_dir={
'': '${PADDLE_BINARY_DIR}/python', '': '${PADDLE_BINARY_DIR}/python',
...@@ -153,13 +156,15 @@ if '${WITH_FLUID_ONLY}'== 'OFF': ...@@ -153,13 +156,15 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle' package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'
# put all thirdparty libraries in paddle.libs # put all thirdparty libraries in paddle.libs
package_data['paddle.libs']=['libwarpctc.so']
libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if os.name != 'nt':
package_data['paddle.libs']= []
package_data['paddle.libs']=['libwarpctc' + ext_name]
shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
if '${WITH_MKL}' == 'ON': if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_LIB}', libs_path)
shutil.copy('${MKLML_IOMP_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path)
package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so'] package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name]
if '${CMAKE_BUILD_TYPE}' == 'Release': if '${CMAKE_BUILD_TYPE}' == 'Release':
# only change rpath in Release mode. # only change rpath in Release mode.
if '${WITH_MKLDNN}' == 'ON': if '${WITH_MKLDNN}' == 'ON':
...@@ -187,36 +192,47 @@ if '${WITH_NGRAPH}' == 'ON': ...@@ -187,36 +192,47 @@ if '${WITH_NGRAPH}' == 'ON':
'${NGRAPH_CPU_LIB_NAME}', '${NGRAPH_CPU_LIB_NAME}',
'${NGRAPH_TBB_LIB_NAME}'] '${NGRAPH_TBB_LIB_NAME}']
# remove unused paddle/libs/__init__.py # remove unused paddle/libs/__init__.py
os.remove(libs_path+'/__init__.py') if os.path.isfile(libs_path+'/__init__.py'):
os.remove(libs_path+'/__init__.py')
package_dir['paddle.libs']=libs_path package_dir['paddle.libs']=libs_path
# change rpath of core.so, add $ORIGIN/../libs/ to it. # change rpath of core.ext, add $ORIGIN/../libs/ to it.
# The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and # The reason is that libwarpctc.ext, libiomp5.ext etc are in paddle.libs, and
# core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. # core.ext is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
# This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213 # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
if '${CMAKE_BUILD_TYPE}' == 'Release': if '${CMAKE_BUILD_TYPE}' == 'Release':
# only change rpath in Release mode, since in Debug mode, core.so is too large to be changed. if os.name != 'nt':
if "@APPLE@" == "1": # only change rpath in Release mode, since in Debug mode, core.xx is too large to be changed.
command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
else:
command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
if os.system(command) != 0:
raise Exception("patch core.so failed, command: %s" % command)
if '${WITH_FLUID_ONLY}'== 'OFF':
# change rpath of _swig_paddle.so.
if "@APPLE@" == "1": if "@APPLE@" == "1":
command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name
else: else:
command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name
if os.system(command) != 0: if os.system(command) != 0:
raise Exception("patch _swig_paddle.so failed, command: %s" % command) raise Exception("patch core.%s failed, command: %s" % (ext_name, command))
if '${WITH_FLUID_ONLY}'== 'OFF':
# change rpath of _swig_paddle.xx.
if "@APPLE@" == "1":
command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name
else:
command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name
if os.system(command) != 0:
raise Exception("patch _swig_paddle.%s failed, command: %s" % (ext_name, command))
ext_modules = [Extension('_foo', ['stub.cc'])]
if os.name == 'nt':
# fix the path separator under windows
fix_package_dir = {}
for k, v in package_dir.items():
fix_package_dir[k] = v.replace('/', '\\')
package_dir = fix_package_dir
ext_modules = []
setup(name='${PACKAGE_NAME}', setup(name='${PACKAGE_NAME}',
version='${PADDLE_VERSION}', version='${PADDLE_VERSION}',
description='Parallel Distributed Deep Learning', description='Parallel Distributed Deep Learning',
install_requires=setup_requires, install_requires=setup_requires,
packages=packages, packages=packages,
ext_modules=[Extension('_foo', ['stub.cc'])], ext_modules=ext_modules,
package_data=package_data, package_data=package_data,
package_dir=package_dir, package_dir=package_dir,
scripts=paddle_bins scripts=paddle_bins
......