Commit d4b9928c authored by minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imperative_mnist

test=develop
......@@ -126,16 +126,12 @@ if(ANDROID OR IOS)
add_definitions(-DPADDLE_MOBILE_INFERENCE)
endif()
if (APPLE OR WIN32)
if (APPLE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL for building on mac and windows" FORCE)
"Disable MKL for building on mac" FORCE)
endif()
if (WIN32)
set(WITH_DSO OFF CACHE STRING
"Disable DSO when compiling for Windows" FORCE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL when compiling for Windows" FORCE)
set(WITH_DISTRIBUTE OFF CACHE STRING
"Disable DISTRIBUTE when compiling for Windows" FORCE)
set(WITH_C_API OFF CACHE STRING
......
......@@ -44,9 +44,9 @@ if(WIN32)
set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
endif(WIN32)
if(Apple)
if(APPLE)
set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
endif(Apple)
endif(APPLE)
find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
......
......@@ -23,15 +23,14 @@ SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn)
SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
IF(WIN32 OR APPLE)
IF(APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with MKLDNN in Paddle yet."
"Mac is not supported with MKLDNN in Paddle yet."
"Force WITH_MKLDNN=OFF")
SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in MacOS" FORCE)
return()
ENDIF()
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
......@@ -44,10 +43,14 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
ELSE()
MESSAGE(FATAL_ERROR "Should enable MKLML when building MKLDNN")
ENDIF()
SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
IF(NOT WIN32)
SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
ENDIF(NOT WIN32)
ExternalProject_Add(
${MKLDNN_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
......@@ -58,8 +61,15 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
CMAKE_ARGS -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
......@@ -67,6 +77,11 @@ ExternalProject_Add(
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-DMKLROOT:PATH=${MKLML_ROOT}
)
if(WIN32)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
else(WIN32)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
endif(WIN32)
ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
......@@ -85,10 +100,14 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
# copy the real so.0 lib to install dir
# it can be directly contained in wheel or capi
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
DEPENDS mkldnn)
if(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll)
else(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
DEPENDS mkldnn)
endif(WIN32)
ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
IF(WITH_C_API)
......
......@@ -16,56 +16,67 @@ IF(NOT ${WITH_MKLML})
return()
ENDIF(NOT ${WITH_MKLML})
IF(WIN32 OR APPLE)
IF(APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with MKLML in Paddle yet."
"Mac is not supported with MKLML in Paddle yet."
"Force WITH_MKLML=OFF")
SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
MESSAGE(STATUS "use pre defined download url")
SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
if(WIN32)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
else()
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
endif()
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
MESSAGE(STATUS "use pre defined download url")
if(WIN32)
SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
else()
SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
ENDIF()
endif()
FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(MKLML)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
" DESTINATION ${MKLML_DST_DIR})\n")
SET(MKLML_PROJECT "extern_mklml")
MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
ExternalProject_Add(
${MKLML_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${MKLML_SOURCE_DIR}
PREFIX ${MKLML_SOURCE_DIR}
URL ${MKLML_URL}
DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz
&& tar zxf ${MKLML_VER}.tgz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND
${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/include ${MKLML_INC_DIR} &&
${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR}
)
INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
......
......@@ -267,7 +267,11 @@ function(cc_library TARGET_NAME)
list(APPEND cc_library_DEPS dynload_mklml)
endif()
add_dependencies(${TARGET_NAME} mklml)
target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
if(WIN32)
target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
else(WIN32)
target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif(WIN32)
endif()
# remove link to python, see notes at:
# https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
......
......@@ -115,20 +115,20 @@ if (NOT PROTOBUF_FOUND OR WIN32)
)
endif ()
if (NOT CBLAS_FOUND)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
copy(openblas_lib
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir}
DEPS extern_openblas
)
elseif (WITH_MKLML)
if (WITH_MKLML)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
copy(mklml_lib
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
DEPS mklml
)
elseif (NOT CBLAS_FOUND OR WIN32)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
copy(openblas_lib
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir}
DEPS extern_openblas
)
endif ()
if (WITH_MKLDNN)
......
......@@ -57,46 +57,43 @@ int main()
return 0;
}" SSE3_FOUND)
# disable AVX by default on windows
if(NOT WIN32)
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}" AVX_FOUND)
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}" AVX_FOUND)
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" AVX2_FOUND)
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" AVX2_FOUND)
# Check AVX512F
set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
13, -5, 6, -7, 9, 2, -6, 3);
__m512i result = _mm512_abs_epi32 (a);
return 0;
}" AVX512F_FOUND)
endif(NOT WIN32)
# Check AVX512F
set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
13, -5, 6, -7, 9, 2, -6, 3);
__m512i result = _mm512_abs_epi32 (a);
return 0;
}" AVX512F_FOUND)
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
......@@ -60,7 +60,7 @@ class Float16Transpiler:
raise TypeError("place should be as CPUPlace/CUDAPlace type")
if scope is None:
scope = global_scope()
if not isinstance(scope, core.Scope):
if not isinstance(scope, core._Scope):
raise TypeError("scope should be as Scope type or None")
self.scope = scope
......
......@@ -208,6 +208,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
......@@ -350,6 +351,23 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.CompressPass.apply ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.ImitationGraph.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.ImitationGraph.all_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.__init__ ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None))
paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.MagnitudePruner.__init__ ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.MagnitudePruner.prune ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.RatioPruner.__init__ ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.RatioPruner.prune ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
......@@ -446,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke
paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None
paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)
......
......@@ -7,27 +7,17 @@ function(windows_symbolic TARGET)
cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
foreach(src ${windows_symbolic_SRCS})
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
message(FATAL " ${src}.cc and ${src}.cu must exist, and ${src}.cu must be a symbolic file.")
endif()
# only copy the xx.cu to .xx.cu when the content is modified
set(copy_flag 1)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR)
if (SOURCE_STR STREQUAL TARGET_STR)
set(copy_flag 0)
endif()
endif()
if (copy_flag)
add_custom_command(OUTPUT .${src}.cu
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
endif(copy_flag)
add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu)
message(FATAL " ${src}.cc and ${src}.cu must exist, and ${src}.cu must be a symbolic file.")
endif()
file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)
add_custom_command(OUTPUT ${final_path}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
endforeach()
endfunction()
......@@ -48,10 +38,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
......@@ -84,6 +74,7 @@ cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool)
cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
......
......@@ -165,7 +165,7 @@ template <typename T>
class GreaterThanChecker {
public:
explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
void operator()(const T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
}
......@@ -177,7 +177,7 @@ template <typename T>
class EqualGreaterThanChecker {
public:
explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
void operator()(const T& value) const {
PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
}
......@@ -193,7 +193,7 @@ class DefaultValueSetter {
public:
explicit DefaultValueSetter(T default_value)
: default_value_(default_value) {}
void operator()(T& value) const { value = default_value_; } // NOLINT
void operator()(T* value) const { *value = default_value_; }
private:
T default_value_;
......@@ -203,7 +203,7 @@ template <typename T>
class EnumInContainer {
public:
explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
void operator()(T& val) const {
void operator()(const T& val) const {
PADDLE_ENFORCE(container_.find(val) != container_.end(),
"Value %s is not in enum container %s", val,
ContainerDebugString());
......@@ -232,7 +232,8 @@ class EnumInContainer {
// an attribute can have more than one limits
template <typename T>
class TypedAttrChecker {
typedef std::function<void(T&)> ValueChecker;
typedef std::function<void(T*)> DefaultValueChecker;
typedef std::function<void(const T&)> ValueChecker;
public:
explicit TypedAttrChecker(const std::string& attr_name)
......@@ -268,17 +269,17 @@ class TypedAttrChecker {
return *this;
}
void operator()(AttributeMap& attr_map) const { // NOLINT
if (!attr_map.count(attr_name_)) {
void operator()(AttributeMap* attr_map) const {
if (!attr_map->count(attr_name_)) {
// user did not set this attr
PADDLE_ENFORCE(!default_value_setter_.empty(),
"Attribute '%s' is required!", attr_name_);
// default_value_setter_ has no more than one element
T val;
(default_value_setter_[0])(val);
attr_map[attr_name_] = val;
(default_value_setter_[0])(&val);
(*attr_map)[attr_name_] = val;
}
Attribute& attr = attr_map.at(attr_name_);
Attribute& attr = attr_map->at(attr_name_);
ExtractAttribute<T> extract_attr(attr_name_);
T* attr_value = extract_attr(attr);
for (const auto& checker : value_checkers_) {
......@@ -289,12 +290,12 @@ class TypedAttrChecker {
private:
std::string attr_name_;
std::vector<ValueChecker> value_checkers_;
std::vector<ValueChecker> default_value_setter_;
std::vector<DefaultValueChecker> default_value_setter_;
};
// check whether op's all attributes fit their own limits
class OpAttrChecker {
typedef std::function<void(AttributeMap&)> AttrChecker;
typedef std::function<void(AttributeMap*)> AttrChecker;
public:
template <typename T>
......@@ -304,7 +305,7 @@ class OpAttrChecker {
return *(checker.target<TypedAttrChecker<T>>());
}
void Check(AttributeMap& attr_map) const { // NOLINT
void Check(AttributeMap* attr_map) const {
for (const auto& checker : attr_checkers_) {
checker(attr_map);
}
......
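The refactor above makes validators read-only (`void(const T&)`) and gives default setters a distinct mutating signature (`void(T*)`), with `Check` now taking `AttributeMap*`. A minimal standalone sketch of that split, with hypothetical names rather than Paddle's real API:

#include <functional>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical stand-in for TypedAttrChecker: validators take a const
// reference, while the default setter gets a pointer it may write through.
template <typename T>
class AttrChecker {
 public:
  using ValueChecker = std::function<void(const T&)>;  // read-only checks
  using DefaultValueSetter = std::function<void(T*)>;  // mutating setter

  AttrChecker& GreaterThan(T lower) {
    checkers_.push_back([lower](const T& v) {
      if (!(v > lower)) throw std::runtime_error("larger_than check fails");
    });
    return *this;
  }
  AttrChecker& SetDefault(T def) {
    setter_ = [def](T* v) { *v = def; };
    return *this;
  }
  // Mirrors Check(AttributeMap*): fill in the default, then validate.
  void operator()(std::map<std::string, T>* attrs,
                  const std::string& name) const {
    if (!attrs->count(name)) {
      if (!setter_) throw std::runtime_error(name + " is required");
      T val;
      setter_(&val);
      (*attrs)[name] = val;
    }
    for (const auto& check : checkers_) check(attrs->at(name));
  }

 private:
  std::vector<ValueChecker> checkers_;
  DefaultValueSetter setter_;
};

int main() {
  AttrChecker<int> checker;
  checker.SetDefault(4).GreaterThan(0);
  std::map<std::string, int> attrs;  // stand-in for AttributeMap
  checker(&attrs, "groups");         // inserts 4, then validates 4 > 0
}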
......@@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
ClearFetchOp(graph_.get(), &fetch_ops);
return fetches;
}
void FastThreadedSSAGraphExecutor::RunOpAsync(
std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op,
......
......@@ -355,7 +355,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
BuildStrategy::GradientScaleStrategy::kCustomized) {
// TODO(paddle-dev): Why is there no input for this op_handle?
auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]);
auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType();
CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0],
out_dtype);
}
// This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss.
......@@ -658,13 +660,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
ir::Graph *result, const std::string &loss_grad_name,
ir::Node *out_var_node) const {
ir::Node *out_var_node, proto::VarType::Type dtype) const {
for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
auto *op_handle = new ScaleLossGradOpHandle(
result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx);
local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype);
result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only use device_count as scale
......
......@@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name,
ir::Node *out_var_node) const;
ir::Node *out_var_node,
proto::VarType::Type dtype) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const;
......
......@@ -22,39 +22,66 @@ namespace details {
ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
Scope *scope,
platform::Place place,
platform::DeviceContext *dev_ctx)
platform::DeviceContext *dev_ctx,
proto::VarType::Type dtype)
: OpHandleBase(node),
coeff_(static_cast<float>(1.0 / num_dev)),
scope_(scope),
place_(place) {
place_(place),
out_dtype_(dtype) {
this->SetDeviceContext(place_, dev_ctx);
}
ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
struct ScaleLossGradFunctor {
float coeff_;
Tensor *out_;
platform::Place place_;
OpHandleBase *op_handle_;
proto::VarType::Type out_dtype_;
platform::DeviceContext *ctx_;
ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
OpHandleBase *op_handle, proto::VarType::Type dtype,
platform::DeviceContext *ctx)
: coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
template <typename OutT>
void apply() const {
auto *out_data = out_->mutable_data<OutT>(place_);
if (platform::is_cpu_place(place_)) {
*out_data = static_cast<OutT>(coeff_);
} else {
#ifdef PADDLE_WITH_CUDA
OutT cast_coeff = static_cast<OutT>(coeff_);
auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), out_data,
platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
stream);
VLOG(10) << place_ << "RUN Scale loss grad op";
#endif
}
}
};
void ScaleLossGradOpHandle::RunImpl() {
// Doesn't wait any event
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
float *tmp = local_scope.FindVar(var_name)
->GetMutable<LoDTensor>()
->mutable_data<float>(make_ddim({1}), place_);
auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
tensor->Resize(make_ddim({1}));
if (platform::is_cpu_place(place_)) {
*tmp = coeff_;
} else {
#ifdef PADDLE_WITH_CUDA
this->RunAndRecordEvent([&] {
auto stream = static_cast<platform::CUDADeviceContext *>(
this->dev_ctxes_.at(place_))
->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(10) << place_ << "RUN Scale loss grad op";
});
ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
this->dev_ctxes_.at(place_));
this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
#else
ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
framework::VisitDataType(out_dtype_, func);
#endif
}
}
std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
......
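`VisitDataType` pairs a runtime dtype tag with a functor whose templated `apply<OutT>()` does the typed work, which is how the new `ScaleLossGradFunctor` supports loss gradients of any dtype. A minimal sketch of that dispatch pattern in plain C++; the enum and visitor here are illustrative, not Paddle's:

#include <cstdint>
#include <iostream>
#include <stdexcept>

enum class DType { FP32, FP64, INT32 };  // illustrative tag, not proto::VarType

// Like ScaleLossGradFunctor: the runtime dtype picks which apply<OutT>() runs.
struct FillWithCoeff {
  float coeff;
  void* out;
  template <typename OutT>
  void apply() const {
    *static_cast<OutT*>(out) = static_cast<OutT>(coeff);
  }
};

template <typename Visitor>
void VisitDType(DType t, Visitor visitor) {
  switch (t) {
    case DType::FP32: visitor.template apply<float>(); break;
    case DType::FP64: visitor.template apply<double>(); break;
    case DType::INT32: visitor.template apply<int32_t>(); break;
    default: throw std::runtime_error("unsupported dtype");
  }
}

int main() {
  double slot = 0.0;
  VisitDType(DType::FP64, FillWithCoeff{0.25f, &slot});
  std::cout << slot << "\n";  // 0.25, cast to the tensor's runtime dtype
}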
......@@ -26,8 +26,8 @@ namespace details {
struct ScaleLossGradOpHandle : public OpHandleBase {
ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
platform::Place place,
platform::DeviceContext *context);
platform::Place place, platform::DeviceContext *context,
proto::VarType::Type dtype);
~ScaleLossGradOpHandle() final;
......@@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
float coeff_;
Scope *scope_;
platform::Place place_;
proto::VarType::Type out_dtype_;
};
} // namespace details
......
......@@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc(
const std::string& output) {
auto proto = base_desc;
framework::OpDesc desc(proto, nullptr);
desc.SetType("conv2d_fusion");
desc.SetInput("Bias", {bias});
desc.SetInput("ResidualData", {bias1});
desc.SetAttr("activation", activation);
desc.SetOutput("Output", {output});
desc.SetAttr("is_test", true);
desc.SetAttr("use_cudnn", false);
desc.Flush();
return *desc.Proto();
}
std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name = "conv_elementwise_add_act_fuse";
const std::string pattern_name = "conv_elementwise_add2_act_fuse";
FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd;
......@@ -76,22 +78,23 @@ std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
framework::OpDesc new_op_desc(new_op_proto, nullptr);
// Create a new node for the fused op.
graph->CreateOpNode(&new_op_desc);
auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
// Link inputs and outputs.
PADDLE_ENFORCE(subgraph.count(x));
auto* conv_in_node = subgraph.at(x);
IR_NODE_LINK_TO(conv_in_node, conv_op); // Input
IR_NODE_LINK_TO(conv_filter, conv_op); // Filter
IR_NODE_LINK_TO(conv_op, conv_out); // Output
IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias
IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias
IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input
IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter
IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias
IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias
IR_NODE_LINK_TO(new_conv_op, act_out); // Output
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(),
{conv_op, elementwise_add_op, elementwise_add_op_1,
elementwise_add_out});
GraphSafeRemoveNodes(
graph.get(),
{conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
elementwise_add_out, elementwise_add_out_1, act_op});
};
gpd(graph.get(), handler);
return graph;
......
......@@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
return out_var;
}
std::unordered_set<std::string> conv_act_set({"identity", "sigmoid", "relu",
"relu6", "relux", "tanh",
"band_pass"});
std::unordered_set<std::string> conv_act_set({"identity", "relu"});
PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) {
conv_in->AsInput();
......@@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
->AsInput();
auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
->assert_is_op_output("elementwise_add")
->assert_is_op_input("elementwise_add", "X")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr())
->assert_is_op("elementwise_add");
auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr())
->assert_is_op_input("elementwise_add", "Y")
->assert_is_op_input("elementwise_add", "X")
->AsInput();
auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr())
->assert_is_op_output("elementwise_add")
......@@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out});
elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
.LinksTo({elementwise_add_out});
elementwise_add_op_1->LinksFrom(
{elementwise_add_out, elementwise_add_in_y_1});
elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1})
.LinksTo({elementwise_add_out_1});
act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out});
return act_out;
}
......
......@@ -157,13 +157,8 @@ bool CheckLoD(const LoD &in, int tensor_height) {
if (level.size() < 2) return false;
// check: the first offset(the begin offset) of each level should be 0.
if (level.front() != 0) return false;
// check: all the offsets in a level should be ascending (no identical
// items allowed).
if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) {
if (a < b) return true;
return false;
})) {
LOG(INFO) << "ascending error";
// check: all the offsets in a level should be ascending (identical items allowed)
if (!std::is_sorted(level.begin(), level.end())) {
return false;
}
}
......
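The removed call passed `level.begin()` twice, so `std::is_sorted` inspected an empty range and vacuously returned true; the check could never fail. A tiny standalone demonstration of the pitfall and the fix:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<size_t> level = {3, 1, 2};  // clearly out of order
  // Old call: both iterators were level.begin(), an empty range, and an
  // empty range is trivially sorted, so the old CheckLoD never failed here.
  assert(std::is_sorted(level.begin(), level.begin()));
  // Fixed call inspects the whole level and reports the disorder.
  assert(!std::is_sorted(level.begin(), level.end()));
}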
......@@ -217,6 +217,11 @@ TEST(LoD, CheckLoD) {
// check with underlying tensor storage.
ASSERT_TRUE(CheckLoD(relative_lod, 5));
ASSERT_FALSE(CheckLoD(relative_lod, 9));
// check whether lod is ascending-sorted (allow same items)
ASSERT_TRUE(CheckLoD({{0, 1, 2, 3, 4, 5}}, 5));
ASSERT_TRUE(CheckLoD({{0, 1, 3, 3, 4, 5}}, 5));
ASSERT_FALSE(CheckLoD({{0, 1, 3, 2, 5}}, 5));
}
TEST(LoD, CheckAbsLoD) {
......
......@@ -215,8 +215,8 @@ class Vector {
auto stream = dev_ctx->stream();
void *src = gpu_->ptr();
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
gpu_->size(), stream);
paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
gpu_->size(), stream);
dev_ctx->Wait();
}
......@@ -261,8 +261,8 @@ class Vector {
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
gpu_->size(), stream);
paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
gpu_->size(), stream);
}
void ImmutableCPU() const {
......@@ -284,7 +284,7 @@ class Vector {
bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_;
mutable memory::AllocationPtr gpu_;
mutable paddle::memory::AllocationPtr gpu_;
mutable int flag_;
mutable std::mutex mtx_;
......
......@@ -31,10 +31,12 @@ std::map<std::string,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {
{"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
{"mul", paddle::operators::ngraphs::BuildMulNode},
{"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
{"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
{"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}};
{"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
{"top_k", paddle::operators::ngraphs::BuildTopKNode}};
void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type();
......
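`NG_NODE_MAP` is an op-name to builder-function dispatch table; `BuildNgNode` looks up `op->Type()` and invokes the matching builder. The same shape in miniature, with placeholder builders standing in for `BuildMulNode`, `BuildTopKNode`, and friends:

#include <functional>
#include <iostream>
#include <map>
#include <string>

int main() {
  // Placeholder builders; the real map values construct ngraph nodes.
  std::map<std::string, std::function<void()>> ng_node_map = {
      {"relu", [] { std::cout << "build relu node\n"; }},
      {"top_k", [] { std::cout << "build top_k node\n"; }},
  };
  std::string op_type = "top_k";              // what op->Type() would return
  auto it = ng_node_map.find(op_type);
  if (it != ng_node_map.end()) it->second();  // dispatch, as BuildNgNode does
}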
......@@ -643,7 +643,7 @@ void OpDesc::CheckAttrs() {
// not by users.
return;
}
checker->Check(attrs_);
checker->Check(&attrs_);
}
void OpDesc::InferShape(const BlockDesc &block) const {
......
......@@ -123,6 +123,8 @@ class OpDesc {
BlockDesc *Block() { return this->block_; }
const BlockDesc *Block() const { return this->block_; }
private:
template <typename MapType>
static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
......
......@@ -24,7 +24,7 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
const VariableNameMap& outputs, AttributeMap attrs) {
auto& info = OpInfoMap::Instance().Get(type);
if (info.Checker() != nullptr) {
info.Checker()->Check(attrs);
info.Checker()->Check(&attrs);
}
auto op = info.Creator()(type, inputs, outputs, attrs);
return std::unique_ptr<OperatorBase>(op);
......
......@@ -23,7 +23,8 @@ limitations under the License. */
#include <unordered_map>
#include <unordered_set>
#include "glog/logging.h" // For VLOG()
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h" // For VLOG()
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/details/op_registry.h"
#include "paddle/fluid/framework/framework.pb.h"
......
......@@ -181,11 +181,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
}
bool OperatorBase::HasInputs(const std::string& name) const {
if (inputs_.find(name) != inputs_.end()) {
return true;
} else {
return false;
}
return inputs_.find(name) != inputs_.end();
}
std::string OperatorBase::Input(const std::string& name) const {
......@@ -476,6 +472,28 @@ const Tensor* ExecutionContext::LegacyInput<Tensor>(
template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const {
auto it = ctx_.inputs.find(name);
if (it == ctx_.inputs.end()) {
return {};
}
const std::vector<Variable*>& vars = it->second;
std::vector<const Tensor*> res;
res.reserve(vars.size());
std::transform(vars.begin(), vars.end(), std::back_inserter(res),
[&](Variable* var) -> const Tensor* {
if (var == nullptr) return nullptr;
PADDLE_ENFORCE(
var->IsType<LoDTensor>(),
"should be LoDTensor, but the received type is %s",
var->Type().name());
return &(var->Get<LoDTensor>());
});
return res;
}
template <>
const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
const std::string& name) const {
auto names = op().Inputs(name);
std::vector<const Tensor*> res;
res.reserve(names.size());
......@@ -1039,8 +1057,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
t = &(var->Get<SelectedRows>().value());
}
if (t != nullptr) {
PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s",
ipt_name, DebugString());
PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized",
ipt_name);
int tmp = static_cast<int>(t->type());
PADDLE_ENFORCE(
tmp == data_type || data_type == -1,
......
......@@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@";
/// e.g. Variable "x@GRAD" is the gradient of variable "x".
constexpr char kGradVarSuffix[] = "@GRAD";
constexpr size_t kGradVarSuffixSize = 5U;
/// Variables with this suffix are supposed to be filled up with zeros.
constexpr char kZeroVarSuffix[] = "@ZERO";
......@@ -60,7 +62,11 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@";
extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix;
std::string result;
result.reserve(var_name.size() + kGradVarSuffixSize);
result += var_name;
result += kGradVarSuffix;
return result;
}
proto::VarType::Type GetDataTypeOfVar(const Variable* var);
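The new `GradVarName` replaces `return var_name + kGradVarSuffix;`, which copies `var_name` and may reallocate again while appending; reserving the exact final size keeps it to one allocation (assuming the result outgrows any small-string buffer). A standalone equivalence check:

#include <cassert>
#include <string>

constexpr char kSuffix[] = "@GRAD";
constexpr size_t kSuffixSize = 5U;  // strlen("@GRAD")

std::string GradName(const std::string& var_name) {
  std::string result;
  result.reserve(var_name.size() + kSuffixSize);  // one exact allocation
  result += var_name;
  result += kSuffix;
  return result;
}

int main() {
  assert(GradName("fc_0.w") == "fc_0.w@GRAD");  // same output as the old form
}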
......@@ -110,8 +116,8 @@ class OperatorBase {
bool HasAttr(const std::string& name) const { return attrs_.count(name); }
template <typename T>
inline const T& Attr(const std::string& name) const {
PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
name);
PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(),
"%s should be in AttributeMap", name);
return boost::get<T>(attrs_.at(name));
}
const AttributeMap& Attrs() const { return attrs_; }
......@@ -197,8 +203,31 @@ class ExecutionContext {
const std::vector<const Variable*> MultiInputVar(
const std::string& name) const {
auto names = op_.Inputs(name);
auto it = ctx_.inputs.find(name);
if (it == ctx_.inputs.end()) {
return {};
}
std::vector<const Variable*> res;
res.reserve(it->second.size());
std::transform(it->second.begin(), it->second.end(),
std::back_inserter(res),
[this](Variable* var) { return var; });
return res;
}
std::vector<Variable*> MultiOutputVar(const std::string& name) const {
auto names = op_.Outputs(name);
auto it = ctx_.outputs.find(name);
if (it == ctx_.outputs.end()) {
return {};
}
return it->second;
}
const std::vector<Variable*> LegacyMultiInputVar(
const std::string& name) const {
auto names = op_.Inputs(name);
std::vector<Variable*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[this](const std::string& name) {
......@@ -208,7 +237,7 @@ class ExecutionContext {
return res;
}
std::vector<Variable*> MultiOutputVar(const std::string& name) const {
std::vector<Variable*> LegacyMultiOutputVar(const std::string& name) const {
auto names = op_.Outputs(name);
std::vector<Variable*> res;
res.reserve(names.size());
......@@ -250,6 +279,38 @@ class ExecutionContext {
template <typename T>
const std::vector<const T*> MultiInput(const std::string& name) const {
auto it = ctx_.inputs.find(name);
if (it == ctx_.inputs.end()) {
return {};
}
const std::vector<Variable*>& vars = it->second;
std::vector<const T*> res;
res.reserve(vars.size());
std::transform(vars.begin(), vars.end(), std::back_inserter(res),
[&](Variable* var) -> const T* {
return var == nullptr ? nullptr : &var->Get<T>();
});
return res;
}
template <typename T>
std::vector<T*> MultiOutput(const std::string& name) const {
auto it = ctx_.outputs.find(name);
if (it == ctx_.outputs.end()) {
return {};
}
const std::vector<Variable*>& vars = it->second;
std::vector<T*> res;
res.reserve(vars.size());
std::transform(vars.begin(), vars.end(), std::back_inserter(res),
[&](Variable* var) -> T* {
return var == nullptr ? nullptr : var->GetMutable<T>();
});
return res;
}
template <typename T>
const std::vector<const T*> LegacyMultiInput(const std::string& name) const {
auto names = op_.Inputs(name);
std::vector<const T*> res;
res.reserve(names.size());
......@@ -262,7 +323,7 @@ class ExecutionContext {
}
template <typename T>
std::vector<T*> MultiOutput(const std::string& name) const {
std::vector<T*> LegacyMultiOutput(const std::string& name) const {
auto names = op_.Outputs(name);
std::vector<T*> res;
res.reserve(names.size());
......@@ -321,6 +382,10 @@ template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const;
template <>
const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
const std::string& name) const;
template <>
Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
......
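The new `MultiInput`/`MultiOutput` bodies all share one shape: look the slot name up once in the context map, return empty if absent, and `std::transform` the `Variable*` list while preserving null entries. That shape in isolation, with `Variable` and `VarMap` as placeholder types:

#include <algorithm>
#include <iterator>
#include <string>
#include <unordered_map>
#include <vector>

struct Variable { int payload; };  // placeholder for framework::Variable

// Placeholder for ctx_.inputs: slot name -> list of (possibly null) vars.
using VarMap = std::unordered_map<std::string, std::vector<Variable*>>;

std::vector<const int*> MultiInput(const VarMap& inputs,
                                   const std::string& name) {
  auto it = inputs.find(name);
  if (it == inputs.end()) return {};  // missing slot yields an empty list
  std::vector<const int*> res;
  res.reserve(it->second.size());
  std::transform(it->second.begin(), it->second.end(),
                 std::back_inserter(res), [](Variable* var) -> const int* {
                   return var == nullptr ? nullptr : &var->payload;
                 });
  return res;
}

int main() {
  Variable a{1};
  VarMap ctx{{"X", {&a, nullptr}}};
  auto xs = MultiInput(ctx, "X");  // {&a.payload, nullptr}: nulls preserved
  return (xs.size() == 2 && xs[1] == nullptr) ? 0 : 1;
}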
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/scope_pool.h"
#include "paddle/fluid/framework/threadpool.h"
namespace paddle {
namespace framework {
ScopePool &ScopePool::Instance() { // NOLINT
static ScopePool pool;
return pool;
}
void ScopePool::DeleteScope(Scope *scope) { delete scope; }
void ScopePool::Insert(std::unique_ptr<Scope> &&s) {
std::lock_guard<std::mutex> guard(mtx_);
scopes_.insert(s.release());
}
void ScopePool::Remove(Scope *s) {
size_t has_scope;
{
std::lock_guard<std::mutex> guard(mtx_);
has_scope = scopes_.erase(s);
}
PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope");
DeleteScope(s);
}
ScopePool::~ScopePool() { Clear(); }
void ScopePool::Clear() {
std::lock_guard<std::mutex> guard(mtx_);
for (auto *s : scopes_) {
DeleteScope(s);
}
scopes_.clear();
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include <unordered_set>
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class ScopePool {
public:
static ScopePool &Instance(); // NOLINT
void Insert(std::unique_ptr<Scope> &&s);
void Remove(Scope *s);
void Clear();
~ScopePool();
private:
ScopePool() = default;
static void DeleteScope(Scope *scope);
std::unordered_set<Scope *> scopes_;
std::mutex mtx_;
};
} // namespace framework
} // namespace paddle
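The pool above is a mutex-guarded registry that takes ownership via `unique_ptr::release` and frees on `Remove` or `Clear`. The same ownership hand-off in a standalone miniature, with `Thing` as a placeholder for `Scope`:

#include <memory>
#include <mutex>
#include <unordered_set>

struct Thing {};  // placeholder for Scope

class Pool {
 public:
  void Insert(std::unique_ptr<Thing> t) {
    std::lock_guard<std::mutex> guard(mtx_);
    things_.insert(t.release());   // pool now owns the raw pointer
  }
  void Remove(Thing* t) {
    {
      std::lock_guard<std::mutex> guard(mtx_);
      things_.erase(t);            // erase by raw-pointer key
    }
    delete t;                      // free outside the lock
  }
  ~Pool() { for (auto* t : things_) delete t; }

 private:
  std::unordered_set<Thing*> things_;
  std::mutex mtx_;
};

int main() {
  Pool pool;
  auto t = std::unique_ptr<Thing>(new Thing);
  auto* raw = t.get();
  pool.Insert(std::move(t));
  pool.Remove(raw);
}

Storing raw pointers rather than an `unordered_set<unique_ptr<Scope>>` lets `Remove` erase by the raw `Scope*` key; heterogeneous lookup for `unique_ptr` keys is awkward before C++20.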
......@@ -25,6 +25,8 @@ limitations under the License. */
namespace paddle {
namespace framework {
class OperatorBase;
using InferShapeVarPtr = boost::variant<VarDesc *, Variable *>;
class InferShapeContext {
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_type.h"
namespace paddle {
namespace framework {
......@@ -27,6 +28,8 @@ void Tensor::check_memory_size() const {
"or maybe the required data-type mismatches the data already stored.");
}
Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
size_t Tensor::memory_size() const {
return holder_ == nullptr ? 0UL : holder_->size() - offset_;
}
......@@ -101,5 +104,12 @@ const DDim& Tensor::dims() const { return dims_; }
int64_t Tensor::numel() const { return product(dims_); }
void Tensor::ResetHolder(std::shared_ptr<memory::Allocation> holder) {
if (holder_) {
PADDLE_ENFORCE_EQ(numel() * SizeOfType(type()), holder->size());
}
holder_ = holder;
}
} // namespace framework
} // namespace paddle
......@@ -69,6 +69,8 @@ class Tensor {
public:
Tensor() : type_(proto::VarType::FP32), offset_(0) {}
explicit Tensor(const proto::VarType::Type&);
/*! Return a pointer to mutable memory block. */
template <typename T>
T* data();
......@@ -162,6 +164,8 @@ class Tensor {
return std::move(holder_);
}
void ResetHolder(std::shared_ptr<memory::Allocation> holder);
private:
/*! holds the memory block if allocated. */
std::shared_ptr<memory::Allocation> holder_;
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/temporary_allocator.h"
namespace paddle {
namespace framework {
......@@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
src_ptr, size);
}
template <typename T>
paddle::framework::Tensor GetTensor(
memory::allocation::AllocationPtr temp_allocation_ptr,
const framework::DDim& dim) {
auto& deleter = temp_allocation_ptr.get_deleter();
auto* allocation_ptr = temp_allocation_ptr.release();
auto shared_allocation =
std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
PADDLE_ENFORCE(
dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_EQ(allocation_ptr->size(),
framework::product(dim) * sizeof(T));
paddle::framework::Tensor temp_tensor(
framework::ToDataType(std::type_index(typeid(T))));
temp_tensor.Resize(dim);
temp_tensor.ResetHolder(std::move(shared_allocation));
return temp_tensor;
}
} // namespace framework
} // namespace paddle
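`GetTensor` above must keep the temporary allocation's custom deleter alive across the `unique_ptr` to `shared_ptr` hand-off: copy the deleter first, release, then rebuild. The hand-off in isolation, with a placeholder `Allocation` type:

#include <functional>
#include <iostream>
#include <memory>

struct Allocation {};  // placeholder for memory::allocation::Allocation
using AllocationPtr =
    std::unique_ptr<Allocation, std::function<void(Allocation*)>>;

std::shared_ptr<Allocation> ToShared(AllocationPtr ptr) {
  auto deleter = ptr.get_deleter();  // copy the custom deleter first
  auto* raw = ptr.release();         // then give up unique ownership
  return std::shared_ptr<Allocation>(raw, deleter);  // same cleanup path
}

int main() {
  AllocationPtr p(new Allocation,
                  [](Allocation* a) { std::cout << "freed\n"; delete a; });
  auto shared = ToShared(std::move(p));
}  // "freed" prints exactly once, via the preserved deleter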
......@@ -69,17 +69,17 @@ void TestWord2vecPrediction(const std::string& model_path) {
std::vector<PaddleTensor> outputs;
CHECK(predictor->Run(slots, &outputs));
PADDLE_ENFORCE(outputs.size(), 1UL);
PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
// Check the output buffer size and result of each tid.
PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL);
float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(static_cast<size_t>(5UL), num_elements);
i++) {
LOG(INFO) << "data: "
<< static_cast<float*>(outputs.front().data.data())[i];
LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i]
<< " result: " << result[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
result[i]);
}
......
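The switch to `PADDLE_ENFORCE_EQ` matters because in the two-argument form of `PADDLE_ENFORCE` the second argument is a message, not a comparand, so `PADDLE_ENFORCE(outputs.size(), 1UL)` only tested that the size was nonzero. A minimal stand-in showing the difference (these are toy macros, not Paddle's):

#include <cassert>

// Toy macros: ENFORCE ignores everything after the condition, which is how
// PADDLE_ENFORCE treats trailing arguments (as a message).
#define ENFORCE(cond, ...) assert(cond)
#define ENFORCE_EQ(a, b) assert((a) == (b))

int main() {
  unsigned long n = 3;
  ENFORCE(n, 1UL);     // passes: only tests n != 0, never compares with 1
  // ENFORCE_EQ(n, 1UL);  // would fail: 3 != 1, the intended check
}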
......@@ -231,11 +231,14 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
inputs[i].data.length());
} else {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(),
0); // stream 0 for sync copy
inputs[i].data.length(), dev_ctx->stream());
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
......
......@@ -208,11 +208,14 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
inputs[i].data.length());
} else {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(),
0); // stream 0 for sync copy
inputs[i].data.length(), dev_ctx->stream());
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
......
......@@ -89,12 +89,21 @@ endif()
if(WITH_MKL)
include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
if(NOT WIN32)
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
else(WIN32)
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX}
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX})
endif(WIN32)
set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
if(EXISTS ${MKLDNN_PATH})
include_directories("${MKLDNN_PATH}/include")
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
if(WIN32)
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
else(WIN32)
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
endif(WIN32)
endif()
else()
set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
......
......@@ -75,6 +75,11 @@ set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc)
# MM DNN
set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn")
download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc)
# text_classification
set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
......@@ -103,6 +108,10 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
"${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
# seq_pool1
inference_analysis_api_test_with_fake_data(test_analyzer_seq_pool1
"${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1" analyzer_seq_pool1_tester.cc "seq_pool1.tar.gz")
# mobilenet with depthwise_conv op
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
......
......@@ -254,5 +254,16 @@ TEST(Analyzer_dam, compare) { compare(); }
TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
// Compare Deterministic result
TEST(Analyzer_dam, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
} // namespace inference
} // namespace paddle
......@@ -180,6 +180,17 @@ TEST(Analyzer_LAC, compare) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
// Compare Deterministic result
TEST(Analyzer_LAC, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
using contrib::AnalysisConfig;
struct DataRecord {
std::vector<std::vector<int64_t>> query_data_all, title_data_all;
std::vector<size_t> lod1, lod2;
size_t batch_iter{0};
size_t batch_size{1};
size_t num_samples; // total number of samples
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch if not enough data is provided.
if (batch_end <= query_data_all.size()) {
data.query_data_all.assign(query_data_all.begin() + batch_iter,
query_data_all.begin() + batch_end);
data.title_data_all.assign(title_data_all.begin() + batch_iter,
title_data_all.begin() + batch_end);
// Prepare LoDs
data.lod1.push_back(0);
data.lod2.push_back(0);
CHECK(!data.query_data_all.empty());
CHECK(!data.title_data_all.empty());
CHECK_EQ(data.query_data_all.size(), data.title_data_all.size());
for (size_t j = 0; j < data.query_data_all.size(); j++) {
// calculate lod
data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size());
data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size());
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, '\t', &data);
// load query data
std::vector<int64_t> query_data;
split_to_int64(data[0], ' ', &query_data);
// load title data
std::vector<int64_t> title_data;
split_to_int64(data[1], ' ', &title_data);
query_data_all.push_back(std::move(query_data));
title_data_all.push_back(std::move(title_data));
}
num_samples = num_lines;
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor lod_query_tensor, lod_title_tensor;
lod_query_tensor.name = "left";
lod_title_tensor.name = "right";
auto one_batch = data->NextBatch();
int size1 = one_batch.lod1[one_batch.lod1.size() - 1]; // token batch size
int size2 = one_batch.lod2[one_batch.lod2.size() - 1]; // token batch size
lod_query_tensor.shape.assign({size1, 1});
lod_query_tensor.lod.assign({one_batch.lod1});
lod_title_tensor.shape.assign({size2, 1});
lod_title_tensor.lod.assign({one_batch.lod2});
// assign data
TensorAssignData<int64_t>(&lod_query_tensor, one_batch.query_data_all);
TensorAssignData<int64_t>(&lod_title_tensor, one_batch.title_data_all);
// Set inputs.
input_slots->assign({lod_query_tensor, lod_title_tensor});
for (auto &tensor : *input_slots) {
tensor.dtype = PaddleDType::INT64;
}
}
void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model;
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<PaddleTensor> input_slots;
int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
for (int bid = 0; bid < epoch; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
(*inputs).emplace_back(input_slots);
}
}
// Easy to profile independently.
TEST(Analyzer_MM_DNN, profile) {
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
PADDLE_ENFORCE_EQ(outputs.size(), 2UL);
for (auto &output : outputs) {
size_t size = GetSize(output);
PADDLE_ENFORCE_GT(size, 0);
float *result = static_cast<float *>(output.data.data());
// output is a probability-like score; check that it lies in (-1, 1).
for (size_t i = 0; i < size; i++) {
EXPECT_GT(result[i], -1);
EXPECT_LT(result[i], 1);
}
}
}
}
// Check the fuse status
TEST(Analyzer_MM_DNN, fuse_statis) {
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_MM_DNN, compare) {
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
// Compare Deterministic result
TEST(Analyzer_MM_DNN, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
} // namespace inference
} // namespace paddle
......@@ -179,5 +179,16 @@ TEST(Analyzer_Chinese_ner, compare) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
// Compare Deterministic result
TEST(Analyzer_Chinese_ner, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
} // namespace inference
} // namespace paddle
......@@ -85,6 +85,17 @@ TEST(Analyzer_resnet50, compare) { compare(); }
TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
// Compare Deterministic result
TEST(Analyzer_resnet50, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -265,6 +265,17 @@ TEST(Analyzer_rnn1, compare) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
// Compare Deterministic result
TEST(Analyzer_rnn1, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
// Test Multi-Thread.
TEST(Analyzer_rnn1, multi_thread) {
contrib::AnalysisConfig cfg;
......
......@@ -158,5 +158,16 @@ TEST(Analyzer_rnn2, compare) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
// Compare Deterministic result
TEST(Analyzer_rnn2, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
} // namespace inference
} // namespace paddle
......@@ -204,5 +204,16 @@ TEST(Analyzer_seq_conv1, compare) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
// Compare Deterministic result
TEST(Analyzer_seq_conv1, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
void SetConfig(AnalysisConfig *cfg) {
cfg->param_file = FLAGS_infer_model + "/params";
cfg->prog_file = FLAGS_infer_model + "/model";
cfg->use_gpu = false;
cfg->device = 0;
cfg->enable_ir_optim = true;
cfg->specify_input_name = true;
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
std::vector<std::string> feed_names = {
"slot10000_embed", "slot10001_embed", "slot10004_embed",
"slot10005_embed", "slot10008_embed", "slot10009_embed",
"slot10012_embed", "slot10013_embed", "slot10108_embed",
"slot13324_embed", "slot13325_embed", "slot13326_embed",
"slot13327_embed", "slot13328_embed", "slot13329_embed",
"slot13330_embed", "slot13331_embed", "slot15501_embed",
"slot15502_embed", "slot15503_embed", "slot15504_embed",
"slot15505_embed", "slot15506_embed", "slot15507_embed",
"slot15508_embed", "slot15516_embed", "slot15519_embed",
"slot15523_embed", "slot15531_embed", "slot15533_embed",
"slot15548_embed", "slot15564_embed", "slot15565_embed",
"slot15566_embed", "slot15570_embed", "slot15571_embed",
"slot15572_embed", "slot15573_embed", "slot15574_embed",
"slot15575_embed", "slot15576_embed", "slot15577_embed",
"slot15579_embed", "slot15581_embed", "slot15582_embed",
"slot15583_embed", "slot15584_embed", "slot5016_embed",
"slot5021_embed", "slot6002_embed", "slot6003_embed",
"slot6004_embed", "slot6005_embed", "slot6006_embed",
"slot6007_embed", "slot6008_embed", "slot6009_embed",
"slot6011_embed", "slot6014_embed", "slot6015_embed",
"slot6023_embed", "slot6024_embed", "slot6025_embed",
"slot6027_embed", "slot6029_embed", "slot6031_embed",
"slot6034_embed", "slot6035_embed", "slot6036_embed",
"slot6037_embed", "slot6039_embed", "slot6048_embed",
"slot6050_embed", "slot6058_embed", "slot6059_embed",
"slot6060_embed", "slot6066_embed", "slot6067_embed",
"slot6068_embed", "slot6069_embed", "slot6070_embed",
"slot6071_embed", "slot6072_embed", "slot6073_embed",
"slot6182_embed", "slot6183_embed", "slot6184_embed",
"slot6185_embed", "slot6186_embed", "slot6188_embed",
"slot6189_embed", "slot6190_embed", "slot6201_embed",
"slot6202_embed", "slot6203_embed", "slot6247_embed",
"slot6248_embed", "slot6250_embed", "slot6251_embed",
"slot6807_embed", "slot6808_embed", "slot6809_embed",
"slot6810_embed", "slot6811_embed", "slot6812_embed",
"slot6813_embed", "slot6814_embed", "slot6815_embed",
"slot6816_embed", "slot6817_embed", "slot6818_embed",
"slot6819_embed", "slot6820_embed", "slot6822_embed",
"slot6823_embed", "slot6826_embed", "slot7002_embed",
"slot7003_embed", "slot7004_embed", "slot7005_embed",
"slot7006_embed", "slot7008_embed", "slot7009_embed",
"slot7010_embed", "slot7011_embed", "slot7013_embed",
"slot7014_embed", "slot7015_embed", "slot7016_embed",
"slot7017_embed", "slot7019_embed", "slot7100_embed",
"slot7506_embed", "slot7507_embed", "slot7514_embed",
"slot7515_embed", "slot7516_embed"};
SetFakeImageInput(inputs, FLAGS_infer_model, true, "model", "params",
&feed_names);
}
// Easy to profile independently.
void profile(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
}
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
}
TEST(Analyzer_seq_pool1, profile) { profile(); }
// Check the fuse status
TEST(Analyzer_seq_pool1, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
LOG(INFO) << "num_ops: " << num_ops;
EXPECT_EQ(num_ops, 314);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -106,6 +106,17 @@ TEST(Analyzer_Text_Classification, compare) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
// Compare Deterministic result
TEST(Analyzer_Text_Classification, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
AnalysisConfig cfg;
SetConfig(&cfg);
......
......@@ -145,6 +145,17 @@ TEST(Analyzer_vis, compare) { compare(); }
TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
// Compare Deterministic result
TEST(Analyzer_vis, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -45,6 +45,7 @@ DEFINE_bool(use_analysis, true,
"Running the inference program in analysis mode.");
DEFINE_bool(record_benchmark, false,
"Record benchmark after profiling the model");
DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
DECLARE_bool(profile);
DECLARE_int32(paddle_num_threads);
......@@ -85,7 +86,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
float *pdata = static_cast<float *>(out.data.data());
float *pdata_ref = static_cast<float *>(ref_out.data.data());
for (size_t j = 0; j < size; ++j) {
EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3);
EXPECT_NEAR(pdata_ref[j], pdata[j], FLAGS_accuracy);
}
break;
}
......@@ -131,7 +132,8 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::string &dirname, bool is_combined = true,
std::string model_filename = "model",
std::string params_filename = "params") {
std::string params_filename = "params",
const std::vector<std::string> *feed_names = nullptr) {
// Set fake_image_data
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only a single batch of data is supported.");
std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
......@@ -145,26 +147,32 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
os << "}\n";
}
LOG(INFO) << os.str();
int dim1 = feed_target_shapes[0][1];
int dim2 = feed_target_shapes[0][2];
int dim3 = feed_target_shapes[0][3];
PaddleTensor input;
std::vector<int> shape({FLAGS_batch_size, dim1, dim2, dim3});
input.shape = shape;
input.dtype = PaddleDType::FLOAT32;
// Fill input data; to keep profiling easy, do not use random data here.
size_t size = FLAGS_batch_size * dim1 * dim2 * dim3;
input.data.Resize(size * sizeof(float));
float *input_data = static_cast<float *>(input.data.data());
for (size_t i = 0; i < size; i++) {
*(input_data + i) = static_cast<float>(i) / size;
if (feed_names) {
PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size());
}
std::vector<PaddleTensor> input_slots(feed_target_shapes.size());
for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
const auto &feed_shape = feed_target_shapes[i];
auto &input = input_slots[i];
std::vector<int> shape({FLAGS_batch_size});
for (size_t s = 1; s < feed_shape.size(); ++s) {
shape.push_back(static_cast<int>(feed_shape[s]));
}
if (feed_names) {
input.name = (*feed_names)[i];
}
input.shape = shape;
input.dtype = PaddleDType::FLOAT32;
size_t len = std::accumulate(shape.begin(), shape.end(), 1,
[](int a, int b) { return a * b; });
input.data.Resize(len * sizeof(float));
input.lod.assign({{0, static_cast<size_t>(FLAGS_batch_size)}});
float *input_data = static_cast<float *>(input.data.data());
// Fill input data; to keep profiling easy, do not use random data here.
for (size_t j = 0; j < len; ++j) {
*(input_data + j) = static_cast<float>(j) / len;
}
}
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots);
}
......@@ -283,6 +291,26 @@ void TestPrediction(const PaddlePredictor::Config *config,
}
}
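// Runs one warmup pass, then FLAGS_repeat more passes over all input
// batches, checking that every pass reproduces the warmup outputs within
// FLAGS_accuracy (via CompareResult).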
void CompareDeterministic(
const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs) {
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto predictor = CreateTestPredictor(config, FLAGS_use_analysis);
// warmup run
std::vector<PaddleTensor> warmup_outputs, outputs;
predictor->Run(inputs[0], &warmup_outputs, batch_size);
// Run num_times to compare deterministic results.
for (int i = 0; i < num_times; i++) {
for (size_t j = 0; j < inputs.size(); j++) {
predictor->Run(inputs[j], &outputs, batch_size);
CompareResult(outputs, warmup_outputs);
}
}
}
void CompareNativeAndAnalysis(
const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs) {
......
......@@ -3,14 +3,16 @@ set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
function (inference_download install_dir url filename)
message(STATUS "Download inference test stuff from ${url}/${filename}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}")
message(STATUS "finish downloading ${filename}")
endfunction()
function (inference_download_and_uncompress install_dir url filename)
inference_download(${install_dir} ${url} ${filename})
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
execute_process(
COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename}
WORKING_DIRECTORY ${install_dir}
)
endfunction()
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
......
......@@ -16,6 +16,7 @@ add_subdirectory(metrics)
add_subdirectory(optimizers)
add_subdirectory(reduce_ops)
add_subdirectory(sequence_ops)
add_subdirectory(jit)
if(WITH_DISTRIBUTE)
add_subdirectory(distributed)
......@@ -42,11 +43,10 @@ if (WITH_DISTRIBUTE)
SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
endif()
register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
# warpctc_op needs cudnn 7 above
if (WITH_GPU AND NOT WIN32)
if (WITH_GPU)
if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
else()
......@@ -65,7 +65,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
if (WITH_GPU)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
......@@ -92,4 +92,8 @@ cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
if (WITH_PYTHON)
cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind)
endif()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
......@@ -22,7 +22,7 @@ DECLARE_bool(cudnn_exhaustive_search);
namespace paddle {
namespace operators {
#if CUDNN_VERSION >= 7001
#if CUDNN_VERSION >= 7100
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
......@@ -204,7 +204,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
#if CUDNN_VERSION >= 7001
#if CUDNN_VERSION >= 7100
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel<float>,
ops::CUDNNConvFusionOpKernel<double>);
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
......@@ -123,6 +124,8 @@ class GemmConvKernel : public framework::OpKernel<T> {
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
auto& dev_ctx = context.template device_context<DeviceContext>();
const int batch_size = static_cast<int>(input->dims()[0]);
// filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
......@@ -155,13 +158,16 @@ class GemmConvKernel : public framework::OpKernel<T> {
// to call the matrix multiplication interface.
Tensor col_matrix;
if (is_expand) {
col.mutable_data<T>(col_shape, context.GetPlace());
auto tmp_allocation_ptr =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
framework::product(col_shape) * sizeof(T));
col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim input_shape =
framework::slice_ddim(input->dims(), 1, input->dims().size());
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
......@@ -178,7 +184,6 @@ class GemmConvKernel : public framework::OpKernel<T> {
math::Vol2ColFunctor<DeviceContext, T> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
auto& dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
......@@ -237,6 +242,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
const int batch_size = static_cast<int>(input->dims()[0]);
auto& dev_ctx = context.template device_context<DeviceContext>();
// filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
// output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
......@@ -262,8 +269,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim input_shape =
framework::slice_ddim(input->dims(), 1, input->dims().size());
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
......@@ -286,13 +293,15 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
// to call the matrix multiplication interface.
Tensor col_matrix;
if (is_expand) {
col.mutable_data<T>(col_shape, context.GetPlace());
auto tmp_allocation_ptr =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
framework::product(col_shape) * sizeof(T));
col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
if (input_grad) {
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <limits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
......@@ -82,10 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
Tensor track;
int* track_value =
track.mutable_data<int>(emission_dims, platform::CPUPlace());
const auto& ker = math::jitkernel::KernelPool::Instance()
.template Get<math::jitkernel::CRFDecodeKernel<T>>(
static_cast<int>(tag_num));
ker->Compute(static_cast<int>(seq_len), x, w, alpha_value, track_value);
auto ker = jit::Get<jit::kCRFDecoding, jit::CRFDecodingTuples<T>,
platform::CPUPlace>(tag_num);
ker(static_cast<int>(seq_len), x, w, alpha_value, track_value, tag_num);
T max_score = -std::numeric_limits<T>::max();
int max_i = 0;
for (size_t i = 0; i < tag_num; ++i) {
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <array>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/dequantize_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::reorder;
using platform::to_void_cast;
using Tensor = framework::Tensor;
using framework::DataLayout;
using mkldnn::stream;
using platform::GetMKLDNNFormat;
template <typename T>
class DeQuantOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("Input");
auto scale_data = ctx.Attr<float>("Scale");
auto* output = ctx.Output<Tensor>("Output");
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& engine = dev_ctx.GetEngine();
const T* input_data = input->data<T>();
float* output_data = output->mutable_data<float>(ctx.GetPlace());
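// Dequantization is expressed as an MKL-DNN reorder with output scale
// 1/Scale, i.e. FP32 = INT8 / Scale.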
std::vector<float> reorder_scale = {1.0f / scale_data};
std::vector<primitive> pipeline;
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
mkldnn::memory::data_type src_dt =
paddle::framework::ToMKLDNNDataType(input->type());
mkldnn::memory::format src_fmt = input->format();
mkldnn::primitive_attr attri;
int mask = 0;
attri.set_output_scales(mask, reorder_scale);
auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt);
auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
auto src_memory =
std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
std::shared_ptr<primitive::at> src_memory_p =
std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
auto dst_md = platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32,
memory::format::nchw);
auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
auto dst_memory = mkldnn::memory(dst_pd, to_void_cast<float>(output_data));
auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
new reorder::primitive_desc(src_pd, dst_pd, attri));
auto reorder_p = std::shared_ptr<reorder>(
new reorder(*reorder_pd, *src_memory_p, dst_memory));
pipeline.push_back(*reorder_p);
stream(stream::kind::eager).submit(pipeline).wait();
output->set_format(GetMKLDNNFormat(dst_memory));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(dequantize, MKLDNN, ::paddle::platform::CPUPlace,
ops::DeQuantOpKernel<uint8_t>, ops::DeQuantOpKernel<int8_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/dequantize_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
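// The dequantize op is only registered as an MKL-DNN kernel below, so
// kernel dispatch always requests the kMKLDNN library and layout.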
framework::OpKernelType DeQuantOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_ = framework::LibraryType::kMKLDNN;
framework::DataLayout layout_ = framework::DataLayout::kMKLDNN;
return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
ctx.GetPlace(), layout_, library_);
}
void DeQuantOpMaker::Make() {
AddInput("Input", "input data");
AddOutput("Output", "output data");
AddAttr<float>("Scale", "scale data").SetDefault({1.0f});
AddComment(R"DOC(This op will dequantize data from INT8 to FP32)DOC");
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using framework::OpKernelType;
using framework::Tensor;
class DeQuantOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
ctx->ShareLoD("Input", /*->*/ "Output");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class DeQuantOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
class DeQuantGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
......@@ -25,7 +25,7 @@ namespace detail {
*/
template <typename T, typename... ARGS>
inline T& Ref(T* ptr, ARGS&&... args) {
PADDLE_ENFORCE(ptr != nullptr, args...);
PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...));
return *ptr;
}
......
......@@ -142,12 +142,13 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
vars->mutable_data<T>(ctx.GetPlace());
framework::Tensor d_temp;
framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);
framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp);
// At least use 32 threads, at most 512 threads.
// blockx is multiple of 32.
int blockx = std::min(
static_cast<long>(((feature_width * num_priors + 31) >> 5) << 5), 512L);
static_cast<int64_t>(((feature_width * num_priors + 31) >> 5) << 5),
512L);
int gridx = (feature_width * num_priors + blockx - 1) / blockx;
dim3 threads(blockx, 1);
dim3 grids(gridx, feature_height);
......
......@@ -84,7 +84,9 @@ class ProtoEncodeHelper {
~ProtoEncodeHelper() {
#define REPLACE_ENFORCE_GLOG 1
// Make sure callers did not perform operations that exceeded the promised max_size.
paddle::platform::throw_on_error(p_ <= limit_);
if (paddle::platform::is_error(p_ <= limit_)) {
paddle::platform::throw_on_error(p_ <= limit_);
}
#undef REPLACE_ENFORCE_GLOG
}
......
......@@ -33,7 +33,7 @@ register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS})
if(WITH_GPU AND NOT WIN32)
set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common)
op_library(gen_nccl_id_op ${DISTRIBUTE_DEPS} nccl_common)
op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common)
endif()
set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE)
......
......@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_div_grad,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
......
......@@ -16,11 +16,14 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
#endif
namespace paddle {
namespace operators {
......@@ -81,8 +84,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
UpdateDataFormat(ctx, const_cast<Tensor*>(x), "x_data_format");
UpdateDataFormat(ctx, const_cast<Tensor*>(y), "y_data_format");
Xbyak::util::Cpu cpu;
const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F);
const bool is_avx512_enabled = platform::MayIUse(platform::avx512f);
const bool are_dims_divisable = !(x_int_dims[1] % 16);
const bool is_x_format_correct = x->format() == memory::format::nChw16c;
const bool is_y_format_correct = y->format() == memory::format::nc;
......@@ -108,10 +110,8 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
constexpr int simd_width = 16;
int C = c / simd_width;
const auto& multiply =
math::jitkernel::KernelPool::Instance()
.template Get<math::jitkernel::EltwiseMulnChw16cNCKernel<T>>(n);
auto multiply = jit::Get<jit::kNCHW16CMulNC, jit::NCHW16CMulNCTuples<T>,
platform::CPUPlace>(0);
#pragma omp parallel for collapse(2)
for (int ni = 0; ni < n; ni++) {
for (int ci = 0; ci < C; ci++) {
......@@ -122,7 +122,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
auto ptr_z =
z_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
multiply->Compute(ptr_x, ptr_y, ptr_z, h, w);
multiply(ptr_x, ptr_y, ptr_z, h, w);
}
}
}
......
......@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
elementwise_mul, ops::ElementwiseMulKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseMulKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseMulKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseMulKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseMulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, plat::float16>);
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fill_zeros_like_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
......@@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
......@@ -15,9 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include <cstring> // for memcpy
#include <string>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
namespace paddle {
......@@ -182,27 +182,29 @@ class FusionGRUKernel : public framework::OpKernel<T> {
const int total_T = x_dims[0]; \
const int D3 = wh_dims[1]
#define INIT_OTHER_DEFINES \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const math::jitkernel::gru_attr_t attr( \
D, ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("activation")); \
math::jitkernel::gru_t one_step; \
const auto& ker = \
math::jitkernel::KernelPool::Instance() \
.template Get<math::jitkernel::GRUKernel<T>, \
const math::jitkernel::gru_attr_t&>(attr); \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
auto place = ctx.GetPlace(); \
#define INIT_OTHER_DEFINES \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const jit::gru_attr_t attr( \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("activation"))); \
jit::gru_t one_step; \
auto ComputeH1 = \
jit::Get<jit::kGRUH1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
auto ComputeHtPart1 = \
jit::Get<jit::kGRUHtPart1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
auto ComputeHtPart2 = \
jit::Get<jit::kGRUHtPart2, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
auto place = ctx.GetPlace(); \
T* xx_data = xx->mutable_data<T>(place)
void SeqCompute(const framework::ExecutionContext& ctx) const {
......@@ -241,7 +243,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
} else {
one_step.gates = xx_data;
one_step.ht = hidden_out_data;
ker->ComputeH1(&one_step, &attr);
ComputeH1(&one_step, &attr);
prev_hidden_data = hidden_out_data;
tstart = 1;
move_step();
......@@ -254,12 +256,12 @@ class FusionGRUKernel : public framework::OpKernel<T> {
one_step.gates = xx_data;
one_step.ht_1 = prev_hidden_data;
one_step.ht = hidden_out_data;
ker->ComputeHtPart1(&one_step, &attr);
ComputeHtPart1(&one_step, &attr);
// gemm rt * Ws
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
xx_data + D2, D3);
ker->ComputeHtPart2(&one_step, &attr);
ComputeHtPart2(&one_step, &attr);
// save prev
prev_hidden_data = hidden_out_data;
move_step();
......@@ -323,7 +325,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
for (int i = 0; i < max_bs; ++i) {
one_step.gates = cur_in_data;
one_step.ht = cur_out_data;
ker->ComputeH1(&one_step, &attr);
ComputeH1(&one_step, &attr);
// add offset
cur_in_data += D3;
cur_out_data += D;
......@@ -351,7 +353,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
one_step.gates = cur_batched_data;
one_step.ht_1 = cur_prev_hidden_data;
one_step.ht = cur_out_data;
ker->ComputeHtPart1(&one_step, &attr);
ComputeHtPart1(&one_step, &attr);
cur_batched_data += D3;
cur_prev_hidden_data += D;
......@@ -369,7 +371,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
one_step.gates = cur_batched_data;
one_step.ht_1 = cur_prev_hidden_data;
one_step.ht = cur_out_data;
ker->ComputeHtPart2(&one_step, &attr);
ComputeHtPart2(&one_step, &attr);
cur_batched_data += D3;
cur_prev_hidden_data += D;
cur_out_data += D;
......
......@@ -14,9 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
namespace paddle {
......@@ -235,31 +235,32 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
const int D = wh_dims[0]; \
const int D4 = wh_dims[1]
#define INIT_OTHER_DEFINES \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/ \
const T* wp_data = bias->data<T>() + D4; \
/* for peephole only*/ \
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
} \
const math::jitkernel::lstm_attr_t attr( \
D, ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("candidate_activation"), \
ctx.Attr<std::string>("cell_activation"), use_peepholes); \
math::jitkernel::lstm_t one_step; \
one_step.wp = wp_data; \
one_step.checked = checked_cell_data; \
const auto& ker = \
math::jitkernel::KernelPool::Instance() \
.template Get<math::jitkernel::LSTMKernel<T>, \
const math::jitkernel::lstm_attr_t&>(attr)
#define INIT_OTHER_DEFINES \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/ \
const T* wp_data = bias->data<T>() + D4; \
/* for peephole only*/ \
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
} \
const jit::lstm_attr_t attr( \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")), \
use_peepholes); \
jit::lstm_t one_step; \
one_step.wp = wp_data; \
one_step.checked = checked_cell_data; \
auto ComputeC1H1 = \
jit::Get<jit::kLSTMC1H1, jit::LSTMTuples<T>, platform::CPUPlace>(attr); \
auto ComputeCtHt = \
jit::Get<jit::kLSTMCtHt, jit::LSTMTuples<T>, platform::CPUPlace>(attr)
// Wh GEMM
#define GEMM_WH_ADDON(bs, prev, out) \
......@@ -305,7 +306,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
one_step.gates = xx_data;
one_step.ct = c_out_data;
one_step.ht = h_out_data;
ker->ComputeC1H1(&one_step, &attr);
ComputeC1H1(&one_step, &attr);
tstart = 1;
// move one step
prev_h_data = h_out_data;
......@@ -321,7 +322,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
one_step.ct_1 = prev_c_data;
one_step.ct = c_out_data;
one_step.ht = h_out_data;
ker->ComputeCtHt(&one_step, &attr);
ComputeCtHt(&one_step, &attr);
// move one step
prev_h_data = h_out_data;
prev_c_data = c_out_data;
......@@ -401,7 +402,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
one_step.gates = cur_in_data;
one_step.ct = cur_c_out_data;
one_step.ht = cur_h_out_data;
ker->ComputeC1H1(&one_step, &attr);
ComputeC1H1(&one_step, &attr);
cur_in_data += D4;
cur_c_out_data += D;
......@@ -431,7 +432,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
one_step.ct_1 = cur_prev_c_data;
one_step.ct = cur_c_out_data;
one_step.ht = cur_h_out_data;
ker->ComputeCtHt(&one_step, &attr);
ComputeCtHt(&one_step, &attr);
// move one batch
cur_in_data += D4;
......
set(jit_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/jit/kernels.h)
file(WRITE ${jit_file} "// Generated by the paddle/fluid/operators/jit/CMakeLists.txt. DO NOT EDIT!\n\n")
file(APPEND ${jit_file} "\#pragma once\n")
file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place)
file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc)
cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS})
# refer must go first
add_subdirectory(refer)
add_subdirectory(more)
if(WITH_XBYAK)
add_subdirectory(gen)
endif()
cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS})
cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper)
if(NOT WIN32)
cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer)
endif()
# JIT Kernel
JIT (Just-In-Time) Kernel contains actually generated code and some other implementations with the same logic.
Each implementation has its own condition for use, defined in `UseMe`.
They are combined to get the best performance for one single independent function.
They can be very simple functions such as vector multiply, or complicated functions such as LSTM.
They can also be composed with other existing jit kernels to build up a complex function.
Currently only CPU is supported.
## Contents
```txt
PaddlePaddle/Paddle/paddle/fluid/
├── ...
└── operators/
├── .../
└── jit/
├── ...
├── gen/
│ └── ...
├── more/
│ ├── ...
│ ├── mkl/
│ │ └── ...
│ ├── mkldnn/
│ │ └── ...
│ ├── mix/
│ │ └── ...
│ ├── intrinsic/
│ │ └── ...
│ └── openblas/
│ └── ...
└── refer/
└── ...
```
All basic definitions of jit kernels live in `paddle/fluid/operators/jit`, including the three key folders `refer`, `gen`, and `more`. Each kernel has one unique name, while it may have several implementations with the same functionality.
- `refer`: Each kernel must have one reference implementation on CPU, and it should focus only on correctness, without depending on any third-party library.
- `gen`: The generated code is kept here. It is designed for the best performance and depends on Xbyak.
- `more`: All other implementations are kept in this folder, with one directory per library or method kind, such as mkl, mkldnn, openblas, or intrinsic code. Each implementation should have its own advantage.
## How to use
One simple function, `jit::Get`, is provided to obtain a kernel; it is very easy to use.
It automatically returns the expected function with the best performance under the given attributes.
All kernels are included in `paddle/fluid/operators/jit/kernels.h`; you only need to include this one header to get all the registered kernels.
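As a minimal sketch of this API, mirroring calls already present in this commit (for example in `crf_decoding_op.h` and `benchmark.cc`), a vector-multiply kernel could be fetched and invoked as below; the wrapper function and variable names are illustrative only:

```cpp
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/place.h"

namespace jit = paddle::operators::jit;

// Fetch the best kVMul implementation for size n and compute z = x .* y.
// The (x, y, z, n) signature follows the XYZNTuples usage in benchmark.cc.
void VMulExample(const float* x, const float* y, float* z, int n) {
  auto vmul = jit::Get<jit::kVMul, jit::XYZNTuples<float>,
                       paddle::platform::CPUPlace>(n);
  vmul(x, y, z, n);
}
```

`Get` returns a plain callable, so the call site stays the same no matter which implementation (jitcode, `more`, or refer) wins for the given attribute.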
## Solid Test
- Unit Test
All functions should be compared with the corresponding reference functions, covering the data types `float` and `double`.
- Benchmark
All functions should be tested, making sure the `jit::Get` function obtains the best performance with all attributes.
# How to add new kernel
## Required
1. Add `your_key` to `KernelType`.
2. Add the reference function of `your_key` (a hedged sketch follows this list).
Note:
- It should run on CPU and must not depend on any third-party library.
- Add `USE_JITKERNEL_REFER(your_key)` in `refer/CMakeLists.txt` to make sure this code can be used.
3. Add a unit test in `test.cc`, verifying at least `float` and `double`.
Test more data types for some special functions if necessary, for example `int8`.
4. Add functions in `benchmark.cc` to test all functions of the same `KernelType`. Make sure `jit::Get` always gets the best one.
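A minimal sketch of such a reference function, assuming the XYZN-style `(x, y, z, n)` signature that `BenchXYZNKernel` uses in `benchmark.cc`; the name `RefYourKey` and the vector-add body are placeholders for whatever `your_key` actually computes:

```cpp
// Hypothetical reference implementation for a vector-add style your_key.
// Plain CPU loop, no third-party dependencies, correctness first.
template <typename T>
void RefYourKey(const T* x, const T* y, T* z, int n) {
  for (int i = 0; i < n; ++i) {
    z[i] = x[i] + y[i];
  }
}
```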
## Optional
Add more implementations of `your_key` for performance enhancement.
1. Add functions based on generated code in `gen`. They should be derived from `JitCode` and have a corresponding creator derived from `JitCodeCreator`, which will be registered on `your_key`.
Note: add new `KernelTuples` if necessary; you can refer to `XYZNTuples`.
Specialize the `JitCodeKey` method when adding a new attribute type.
2. Add more functions in `more`; you can use any third party you wish, such as mkl, mkldnn, or intrinsic code, to reach the best performance.
# JIT Kernel
Combine function templates and JIT generation to produce the required kernel functions.
The kernels here are operator units at a smaller granularity than the kernels inside Operators, with more focus on performance across different hardware. There can be implementations based on multiple third-party libraries, and each implementation has its own `UseMe` function that decides under which conditions it may be called.
The functions implemented here can be very fine-grained, such as vector MUL, or complex logic such as LSTM. Complex logic can also be assembled from the lower-level functions here.
Currently only high-performance computation on CPU is supported.
## Directory structure
```txt
PaddlePaddle/Paddle/paddle/fluid/
├── ...
└── operators/
├── .../
└── jit/
├── ...
├── gen/
│ └── ...
├── more/
│ ├── ...
│ ├── mkl/
│ │ └── ...
│ ├── mkldnn/
│ │ └── ...
│ ├── mix/
│ │ └── ...
│ ├── intrinsic/
│ │ └── ...
│ └── openblas/
│ └── ...
└── refer/
└── ...
```
Definitions of the basic classes live in the root directory, which contains the three directories gen, more, and refer. Each directory holds one or more kinds of implementation. Every kernel operator must have a reference implementation, used as the baseline for unit tests; all other implementations are optional.
- gen: code generated with jit, which depends on the xbyak library. This implementation cares most about performance.
- refer: the reference implementations. Every kernel operator must have a reference implementation on CPU, which mainly cares about the correctness of the algorithm logic.
- more: further implementations can be placed here, including mkl, mkldnn, intrinsic, openblas, and so on, as well as combinations of existing kernels.
## Dynamic acquisition
A `jit::Get` method is provided to obtain a kernel by its type. Each implementation has its own applicable scope, and the required kernel function is selected dynamically according to that scope and the current conditions.
## Test
- Logic test
All implementations are compared against the refer code and must satisfy the precision requirements, covering the float and double data types.
- Performance test
The performance of all implementations is compared, and also against the final `jit::Get` method; the implementation that method returns must be the fastest under all conditions.
# How to add a new operator
- Add `your_key` to `KernelType`.
- Implement the reference logic. This must be a CPU implementation and must not depend on any third-party library. After implementing it, add `USE_JITKERNEL_REFER(your_key)` in `refer/CMakeLists.txt` to enable the kernel.
- (optional) Implement more algorithms under the `more` directory; they may depend on third-party libraries such as mkl, intrinsic, or mkldnn.
- (optional) Implement Xbyak-based generated code under the `gen` directory. The jitcode needs its own `JitCodeCreator`, registered on the same `KernelType` as refer.
- When necessary, new `KernelTuples` can be added (refer to `XYZNTuples`); a newly added attribute type needs a specialization of the `JitCodeKey` method.
- Add unit tests in `test.cc`, covering at least the `float` and `double` data types; support extra data types, such as the `int8`-related functions, where necessary.
- Add the corresponding performance comparison in `benchmark.cc`. For one kind of kernel, all implementations must be compared, ensuring that the implementation obtained by `jit::Get` is always the fastest.
# Advantages
- A unified Get method with a simple interface.
- The same logic can have multiple implementations, depending on multiple third-party libraries, without interfering with each other.
- A clear directory structure; no single file accumulates many macro definitions that hurt readability.
- Easy to optimize: one attribute can be targeted directly without affecting performance under other attributes.
- Multiple platforms can be supported, including Linux, Mac, and Windows; at minimum, every platform works correctly. Platform-specific optimizations can be added later. The framework layer can use a unified interface without caring about the underlying implementation.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include <iostream>
#include <random>
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
DEFINE_int32(burning, 10, "Burning times.");
DEFINE_int32(repeat, 3000, "Repeat times.");
DEFINE_int32(max_size, 1000, "The max size to be tested.");
template <typename T>
void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
const T upper = static_cast<T>(20.f), unsigned int seed = 100) {
std::mt19937 rng(seed);
std::uniform_real_distribution<double> uniform_dist(0, 1);
for (int i = 0; i < n; ++i) {
a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
}
std::vector<int> TestSizes() {
std::vector<int> s;
for (int i = 1; i <= FLAGS_max_size; ++i) {
s.push_back(i);
}
return s;
}
template <typename KernelTuples, typename... Args>
struct BenchFunc {
// Returns the average running time of the target function.
double operator()(const typename KernelTuples::func_type tgt, Args... args) {
for (int i = 0; i < FLAGS_burning; ++i) {
tgt(args...);
}
auto start = paddle::platform::PosixInNsec() * 1e-3;
for (int i = 0; i < FLAGS_repeat; ++i) {
tgt(args...);
}
auto end = paddle::platform::PosixInNsec() * 1e-3;
return static_cast<double>(end - start) / FLAGS_repeat;
}
};
namespace jit = paddle::operators::jit;
template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
typename... Args>
void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
BenchFunc<KernelTuples, Args...> benchmark;
std::vector<std::pair<std::string, double>> infos;
// test refer
auto refer = jit::GetRefer<KT, KernelTuples>();
if (!refer) {
LOG(FATAL) << "Refer can not be empty!";
}
infos.push_back(std::make_pair("Refer", benchmark(refer, args...)));
// test jitcode
auto jitcode = jit::GetJitCode<KT, KernelTuples, PlaceType>(attr);
if (jitcode) {
infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...)));
}
// test all impls in more
jit::KernelKey kkey(KT, PlaceType());
auto& pool = jit::KernelPool::Instance().AllKernels();
auto iter = pool.find(kkey);
if (iter != pool.end()) {
auto& impls = iter->second;
for (auto& impl : impls) {
auto i = dynamic_cast<const jit::KernelMore<KernelTuples>*>(impl.get());
if (i && i->UseMe(attr)) {
auto more = i->GetFunc();
infos.push_back(
std::make_pair(i->ImplType(), benchmark(more, args...)));
}
}
}
// Test result from Get function
auto tgt = jit::Get<KT, KernelTuples, PlaceType>(attr);
if (!tgt) {
LOG(FATAL) << "Target can not be empty!";
}
infos.push_back(std::make_pair("Target", benchmark(tgt, args...)));
// print
std::ostringstream loginfos;
loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": ";
for (auto pair : infos) {
loginfos << pair.first << " takes " << pair.second << " us; ";
}
LOG(INFO) << loginfos.str();
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchXYZNKernel() {
for (int d : TestSizes()) {
std::vector<T> x(d), y(d), z(d);
RandomVec<T>(d, x.data());
RandomVec<T>(d, y.data());
BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data(), y.data(),
z.data(), d);
}
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchAXYNKernel() {
for (int d : TestSizes()) {
const T a = static_cast<T>(3);
std::vector<T> x(d), y(d);
RandomVec<T>(d, x.data());
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data(), y.data(),
d);
}
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchXYNKernel() {
for (int d : TestSizes()) {
std::vector<T> x(d), y(d);
RandomVec<T>(d, x.data());
BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data(), y.data(), d);
}
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchLSTMKernel() {
for (bool use_peephole : {true, false}) {
for (int d : TestSizes()) {
const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh,
use_peephole);
std::vector<T> x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d);
RandomVec<T>(4 * d, x.data(), -2.f, 2.f);
RandomVec<T>(3 * d, wp.data(), -2.f, 2.f);
RandomVec<T>(d, ct_1.data(), -2.f, 2.f);
const T* ct_1_data = ct_1.data();
const T* wp_data = wp.data();
T* x_data = x.data();
T* checked_data = checked.data();
T* ct_data = ct.data();
T* ht_data = ht.data();
jit::lstm_t step;
step.gates = x_data;
step.ct_1 = ct_1_data;
step.ct = ct_data;
step.ht = ht_data;
if (use_peephole) {
step.wp = wp_data;
step.checked = checked_data;
}
BenchAllImpls<KT, jit::LSTMTuples<T>, PlaceType>(attr, &step, &attr);
}
}
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchGRUKernel() {
for (int d : TestSizes()) {
const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
std::vector<T> x(3 * d), ht_1(d), ht(d);
RandomVec<T>(3 * d, x.data(), -2.f, 2.f);
RandomVec<T>(d, ht_1.data(), -2.f, 2.f);
const T* ht_1_data = ht_1.data();
T* x_data = x.data();
T* ht_data = ht.data();
jit::gru_t step;
step.gates = x_data;
step.ht_1 = ht_1_data;
step.ht = ht_data;
BenchAllImpls<KT, jit::GRUTuples<T>, PlaceType>(attr, &step, &attr);
}
}
// Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...]
// Options:
// --burning: the number of warm-up runs before timing starts
// --repeat: the number of timed repetitions
// --max_size: the maximum size to be tested
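// Example invocation (flag values are illustrative):
//   ./benchmark --burning=10 --repeat=3000 --max_size=1024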
int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InitGoogleLogging(argv[0]);
LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat
<< " times.";
using T = float;
using PlaceType = paddle::platform::CPUPlace;
// xyzn
BenchXYZNKernel<jit::kVMul, T, PlaceType>();
BenchXYZNKernel<jit::kVAdd, T, PlaceType>();
BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>();
BenchXYZNKernel<jit::kVSub, T, PlaceType>();
// axyn
BenchAXYNKernel<jit::kVScal, T, PlaceType>();
BenchAXYNKernel<jit::kVAddBias, T, PlaceType>();
// xyn
BenchXYNKernel<jit::kVRelu, T, PlaceType>();
BenchXYNKernel<jit::kVIdentity, T, PlaceType>();
BenchXYNKernel<jit::kVExp, T, PlaceType>();
BenchXYNKernel<jit::kVSigmoid, T, PlaceType>();
BenchXYNKernel<jit::kVTanh, T, PlaceType>();
// lstm and peephole
BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>();
BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>();
// gru functions
BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
}
file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak)
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE)
function(USE_JITKERNEL_GEN TARGET)
file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n")
endfunction()
# enable generated jitcode kernels, selected by name
USE_JITKERNEL_GEN(kVMul)
USE_JITKERNEL_GEN(kVAdd)
#USE_JITKERNEL_GEN(kVSub) # TODO(TJ): enable me
USE_JITKERNEL_GEN(kVAddRelu)
USE_JITKERNEL_GEN(kVScal)
USE_JITKERNEL_GEN(kVAddBias)
USE_JITKERNEL_GEN(kVRelu)
USE_JITKERNEL_GEN(kVIdentity)
USE_JITKERNEL_GEN(kVExp)
USE_JITKERNEL_GEN(kVSigmoid)
USE_JITKERNEL_GEN(kVTanh)
USE_JITKERNEL_GEN(kLSTMCtHt)
USE_JITKERNEL_GEN(kLSTMC1H1)
USE_JITKERNEL_GEN(kGRUH1)
USE_JITKERNEL_GEN(kGRUHtPart1)
USE_JITKERNEL_GEN(kGRUHtPart2)
USE_JITKERNEL_GEN(kNCHW16CMulNC)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/act.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = {
REPEAT_8TIMES(1.f),
REPEAT_8TIMES(2.f),
REPEAT_8TIMES(0.5f),
REPEAT_8TIMES(EXP_HIG),
REPEAT_8TIMES(EXP_LOW),
REPEAT_8TIMES(CEPHES_LOG2EF),
REPEAT_8TIMES(CEPHES_EXP_C1),
REPEAT_8TIMES(CEPHES_EXP_C2),
REPEAT_8TIMES(CEPHES_EXP_P0),
REPEAT_8TIMES(CEPHES_EXP_P1),
REPEAT_8TIMES(CEPHES_EXP_P2),
REPEAT_8TIMES(CEPHES_EXP_P3),
REPEAT_8TIMES(CEPHES_EXP_P4),
REPEAT_8TIMES(CEPHES_EXP_P5),
REPEAT_8TIMES(EXP_MAX_INPUT),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)};
int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0};
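// Layout note: each REPEAT_8TIMES(...) entry above fills one 32-byte-aligned
// row of 8 floats, so a constant's byte offset is its row index times
// YMM_FLOAT_BLOCK * sizeof(float). The OFFSET_* macros in act.h index rows
// this way; e.g. OFFSET_SIGMOID_MAX is 15 * YMM_FLOAT_BLOCK * sizeof(float),
// matching SIGMOID_THRESHOLD_MAX in row 15.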
void VActJitCode::genCode() {
int offset = 0;
for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
vmovups(ymm_src, ptr[param1 + offset]);
act<ymm_t>(ymm_dst, ymm_src, type_);
vmovups(ptr[param2 + offset], ymm_dst);
offset += sizeof(float) * YMM_FLOAT_BLOCK;
}
int rest = num_ % YMM_FLOAT_BLOCK;
while (rest > 0) {
int block = XMM_FLOAT_BLOCK;
if (rest >= 4) {
block = 4;
vmovups(xmm_src, ptr[param1 + offset]);
} else if (rest >= 2) {
block = 2;
vmovq(xmm_src, ptr[param1 + offset]);
} else {
block = 1;
vmovss(xmm_src, ptr[param1 + offset]);
}
act<xmm_t>(xmm_dst, xmm_src, type_);
if (rest >= 4) {
vmovups(ptr[param2 + offset], xmm_dst);
} else if (rest >= 2) {
vmovq(ptr[param2 + offset], xmm_dst);
} else {
vmovss(ptr[param2 + offset], xmm_dst);
}
offset += sizeof(float) * block;
rest -= block;
}
ret();
}
#define DECLARE_ACT_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \
public: \
bool UseMe(const int& attr) const override { \
return platform::MayIUse(platform::avx); \
} \
size_t CodeSize(const int& d) const override; \
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
}
DECLARE_ACT_CREATOR(VRelu);
DECLARE_ACT_CREATOR(VIdentity);
DECLARE_ACT_CREATOR(VExp);
DECLARE_ACT_CREATOR(VSigmoid);
DECLARE_ACT_CREATOR(VTanh);
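// For reference, DECLARE_ACT_CREATOR(VRelu) above expands to a VReluCreator
// whose UseMe() requires AVX support and whose CreateJitCode(attr) returns
// make_unique<VReluJitCode>(attr, CodeSize(attr)); only CodeSize() is left
// to be defined out of line, as done below.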
// TODO(TJ): tune these estimated code sizes
size_t VReluCreator::CodeSize(const int& d) const {
return 96 /* init size */ +
(d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ *
8 /* average bytes for each instruction */;
}
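// Worked example: with YMM_FLOAT_BLOCK == 8 and d == 256, this estimates
// 96 + (256 / 8 + 3) * 4 * 8 = 96 + 1120 = 1216 bytes of generated code.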
size_t VIdentityCreator::CodeSize(const int& d) const {
return 96 + (d / YMM_FLOAT_BLOCK + 3) * 4 * 8;
}
size_t VExpCreator::CodeSize(const int& d) const {
return 96 + (d / YMM_FLOAT_BLOCK + 3) * 70 * 8;
}
size_t VSigmoidCreator::CodeSize(const int& d) const {
return 96 + (d / YMM_FLOAT_BLOCK + 3) * 82 * 8;
}
size_t VTanhCreator::CodeSize(const int& d) const {
return 96 + (d / YMM_FLOAT_BLOCK + 3) * 84 * 8;
}
#undef DECLARE_ACT_CREATOR
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator);
REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator);
REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator);
REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator);
REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/operators/math/jit_gen.h"
#include "paddle/fluid/operators/math/jit_kernel_impl.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
extern const float exp_float_consts[];
extern const int exp_int_0x7f[];
extern int g_tmp_mem[];
......@@ -79,94 +59,15 @@ extern int g_tmp_mem[];
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
class VActFunc : public JitCode {
public:
explicit VActFunc(size_t code_size, void* code_ptr)
: JitCode(code_size, code_ptr) {}
virtual const char* name() const = 0;
virtual void genCode() = 0;
protected:
// compute RELU with ymm, xmm
template <typename JMM>
void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) { // NOLINT
JMM zero = JMM(zero_idx);
......@@ -174,7 +75,7 @@ class VActJitCode : public JitCode {
vmaxps(dst, src, zero);
}
// compute EXP with ymm, xmm
template <typename JMM>
void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT
int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) {
......@@ -258,7 +159,7 @@ class VActJitCode : public JitCode {
pop(reg_ptr_global);
}
// compute SIGMOID with ymm, xmm
template <typename JMM>
void sigmoid_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT
int fx_idx = 12, int fy_idx = 13, int mask_idx = 14,
......@@ -283,7 +184,7 @@ class VActJitCode : public JitCode {
pop(reg_ptr_global);
}
// compute TANH with ymm, xmm
template <typename JMM>
void tanh_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT
int fx_idx = 12, int fy_idx = 13, int mask_idx = 14,
......@@ -310,223 +211,109 @@ class VActJitCode : public JitCode {
pop(reg_ptr_global);
}
// compute IDENTITY with ymm, xmm
template <typename JMM>
void identity_jmm(JMM& dst, JMM& src, int zero_idx) { // NOLINT
JMM zero = JMM(zero_idx);
vxorps(zero, zero, zero);
vaddps(dst, src, zero);
// TODO(TJ): use below
// dst.setIdx(src.getIdx());
}
template <typename JMM>
void act(JMM& dst, JMM& src, operand_type type) { // NOLINT
// use 11~15
switch (type) {
case operand_type::RELU:
relu_jmm<JMM>(dst, src, 15);
break;
case operand_type::EXP:
exp_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
break;
case operand_type::SIGMOID:
sigmoid_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
break;
case operand_type::TANH:
tanh_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
break;
case operand_type::IDENTITY:
identity_jmm<JMM>(dst, src, 15);
break;
default:
LOG(FATAL) << "Do not support this operand type: " << type;
break;
}
}
};

class VActJitCode : public VActFunc {
 public:
  explicit VActJitCode(int d, operand_type type, size_t code_size,
                       void* code_ptr = nullptr)
      : VActFunc(code_size, code_ptr), num_(d), type_(type) {
    if (!(type_ == operand_type::RELU || type_ == operand_type::EXP ||
          type_ == operand_type::SIGMOID || type_ == operand_type::TANH ||
          type_ == operand_type::IDENTITY)) {
      LOG(FATAL) << "Do not support this operand type: " << type_;
    }
    this->genCode();
  }

  const char* name() const override {
    std::string base = "VActJitCode";
    switch (type_) {
      case operand_type::RELU:
        base += "_Relu";
        break;
      case operand_type::EXP:
        base += "_Exp";
        break;
      case operand_type::SIGMOID:
        base += "_Sigmoid";
        break;
      case operand_type::TANH:
        base += "_Tanh";
        break;
      case operand_type::IDENTITY:
        base += "_Identity";
        break;
      default:
        break;
    }
    return base.c_str();
  }
  void genCode() override;

 protected:
  int num_;
  operand_type type_;
  reg64_t param1{abi_param1};
  reg64_t param2{abi_param2};
  xmm_t xmm_src = xmm_t(0);
  ymm_t ymm_src = ymm_t(0);
  xmm_t xmm_dst = xmm_t(1);
  ymm_t ymm_dst = ymm_t(1);
};

#define DECLARE_ACT_JITCODE(name, op_type)                                    \
  class name##JitCode : public VActJitCode {                                  \
   public:                                                                    \
    explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \
        : VActJitCode(d, op_type, code_size, code_ptr) {}                     \
  };

DECLARE_ACT_JITCODE(VRelu, operand_type::RELU);
DECLARE_ACT_JITCODE(VIdentity, operand_type::IDENTITY);
DECLARE_ACT_JITCODE(VExp, operand_type::EXP);
DECLARE_ACT_JITCODE(VSigmoid, operand_type::SIGMOID);
DECLARE_ACT_JITCODE(VTanh, operand_type::TANH);

#undef DECLARE_ACT_JITCODE
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/blas.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
void VXXJitCode::genCode() {
  // No stack pushes are needed here, and AVX512 registers need not be saved
  // when AVX512 is not used.
int offset = 0;
if (with_relu_) {
vxorps(ymm_zero, ymm_zero, ymm_zero);
}
if (scalar_index_ == 1) {
vbroadcastss(ymm_src1, ptr[param1]);
} else if (scalar_index_ == 2) {
vbroadcastss(ymm_src2, ptr[param2]);
}
for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
if (scalar_index_ != 1) {
vmovups(ymm_src1, ptr[param1 + offset]);
}
if (scalar_index_ != 2) {
vmovups(ymm_src2, ptr[param2 + offset]);
}
if (type_ == operand_type::MUL) {
vmulps(ymm_dst, ymm_src1, ymm_src2);
} else if (type_ == operand_type::ADD) {
vaddps(ymm_dst, ymm_src1, ymm_src2);
}
if (with_relu_) {
vmaxps(ymm_dst, ymm_zero, ymm_dst);
}
vmovups(ptr[param3 + offset], ymm_dst);
offset += sizeof(float) * YMM_FLOAT_BLOCK;
}
int rest = num_ % YMM_FLOAT_BLOCK;
while (rest > 0) {
int block = XMM_FLOAT_BLOCK;
if (rest >= 4) {
block = 4;
if (scalar_index_ != 1) {
vmovups(xmm_src1, ptr[param1 + offset]);
}
if (scalar_index_ != 2) {
vmovups(xmm_src2, ptr[param2 + offset]);
}
} else if (rest >= 2) {
block = 2;
if (scalar_index_ != 1) {
vmovq(xmm_src1, ptr[param1 + offset]);
}
if (scalar_index_ != 2) {
vmovq(xmm_src2, ptr[param2 + offset]);
}
} else {
block = 1;
if (scalar_index_ != 1) {
vmovss(xmm_src1, ptr[param1 + offset]);
}
if (scalar_index_ != 2) {
vmovss(xmm_src2, ptr[param2 + offset]);
}
}
switch (type_) {
case operand_type::MUL:
vmulps(xmm_dst, xmm_src1, xmm_src2);
break;
case operand_type::ADD:
vaddps(xmm_dst, xmm_src1, xmm_src2);
break;
default:
break;
}
if (with_relu_) {
vmaxps(xmm_dst, xmm_zero, xmm_dst);
}
if (rest >= 4) {
vmovups(ptr[param3 + offset], xmm_dst);
} else if (rest >= 2) {
vmovq(ptr[param3 + offset], xmm_dst);
} else {
vmovss(ptr[param3 + offset], xmm_dst);
}
offset += sizeof(float) * block;
rest -= block;
}
ret();
}
void NCHW16CMulNCJitCode::genCode() {
// RDI is ptr x_input
// RSI is ptr y_input
// RDX is ptr output
// RCX is height
// r8 is width
push(rbx);
xor_(rax, rax);
xor_(r10, r10);
vmovups(zmm3, ptr[rsi]);
L("h_loop");
xor_(rbx, rbx);
L("w_loop");
vmovups(zmm2, ptr[rdi + rax]);
vmulps(zmm1, zmm2, zmm3);
vmovups(ptr[rdx + rax], zmm1);
add(rax, 64);
inc(rbx);
cmp(r8, rbx);
jnz("w_loop");
inc(r10);
cmp(r10, rcx);
jnz("h_loop");
pop(rbx);
ret();
}
class NCHW16CMulNCCreator : public JitCodeCreator<int> {
public:
bool UseMe(const int& attr) const override {
return platform::MayIUse(platform::avx512f);
}
size_t CodeSize(const int& d) const override { return 256 * 1024; }
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
return make_unique<NCHW16CMulNCJitCode>(attr, CodeSize(attr));
}
};
#define DECLARE_BLAS_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \
public: \
bool UseMe(const int& attr) const override { \
return platform::MayIUse(platform::avx); \
} \
size_t CodeSize(const int& d) const override { \
return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
}
DECLARE_BLAS_CREATOR(VMul);
DECLARE_BLAS_CREATOR(VAdd);
DECLARE_BLAS_CREATOR(VSub);
DECLARE_BLAS_CREATOR(VAddRelu);
DECLARE_BLAS_CREATOR(VScal);
DECLARE_BLAS_CREATOR(VAddBias);
#undef DECLARE_BLAS_CREATOR
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kVMul, gen::VMulCreator);
REGISTER_JITKERNEL_GEN(kVAdd, gen::VAddCreator);
// TODO(TJ): enable sub
// REGISTER_JITKERNEL_GEN(kVSub, gen::VSubCreator);
REGISTER_JITKERNEL_GEN(kVAddRelu, gen::VAddReluCreator);
REGISTER_JITKERNEL_GEN(kVScal, gen::VScalCreator);
REGISTER_JITKERNEL_GEN(kVAddBias, gen::VAddBiasCreator);
REGISTER_JITKERNEL_GEN(kNCHW16CMulNC, gen::NCHW16CMulNCCreator);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
class VXXJitCode : public JitCode {
public:
explicit VXXJitCode(int d, operand_type type, int scalar_index,
bool with_relu, size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr),
num_(d),
type_(type),
scalar_index_(scalar_index),
with_relu_(with_relu) {
if (!(type_ == operand_type::MUL || type_ == operand_type::ADD)) {
LOG(FATAL) << "Do not support this operand type: " << type_;
}
this->genCode();
}
virtual const char* name() const {
std::string base = "VXXJitCode";
if (scalar_index_ == 1) {
base += "_Scalar";
} else {
base += "_Vec";
}
if (type_ == operand_type::MUL) {
base += "_Mul";
} else if (type_ == operand_type::ADD) {
base += "_Add";
}
if (scalar_index_ == 2) {
base += "_Scalar";
} else {
base += "_Vec";
}
base += (with_relu_ ? "_Relu" : "");
return base.c_str();
}
void genCode() override;
private:
int num_;
operand_type type_;
int scalar_index_;
bool with_relu_;
reg64_t param1{abi_param1};
reg64_t param2{abi_param2};
reg64_t param3{abi_param3};
xmm_t xmm_src1 = xmm_t(0);
xmm_t xmm_src2 = xmm_t(1);
xmm_t xmm_dst = xmm_t(2);
xmm_t xmm_zero = xmm_t(3);
ymm_t ymm_src1 = ymm_t(0);
ymm_t ymm_src2 = ymm_t(1);
ymm_t ymm_dst = ymm_t(2);
ymm_t ymm_zero = ymm_t(3);
};
#define DECLARE_BLAS_JITCODE(name, op_type, scalar_idx, with_relu) \
class name##JitCode : public VXXJitCode { \
public: \
explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \
: VXXJitCode(d, op_type, scalar_idx, with_relu, code_size, code_ptr) { \
} \
};
DECLARE_BLAS_JITCODE(VMul, operand_type::MUL, 0, false);
DECLARE_BLAS_JITCODE(VAdd, operand_type::ADD, 0, false);
DECLARE_BLAS_JITCODE(VSub, operand_type::SUB, 0, false);
DECLARE_BLAS_JITCODE(VAddRelu, operand_type::ADD, 0, true);
DECLARE_BLAS_JITCODE(VScal, operand_type::MUL, 1, false);
DECLARE_BLAS_JITCODE(VAddBias, operand_type::ADD, 1, false);
#undef DECLARE_BLAS_JITCODE
// nChw16c = nChw16c .* NC
class NCHW16CMulNCJitCode : public JitCode {
public:
DECLARE_JIT_CODE(NCHW16CMulNCJitCode);
explicit NCHW16CMulNCJitCode(int d /*unused*/, size_t code_size,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr) {
this->genCode();
}
void genCode() override;
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/gru.h"
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
void GRUJitCode::genCode() {
reg64_t reg_ptr_gates = rax;
reg64_t reg_ptr_ht_1 = r9;
reg64_t reg_ptr_ht = r10;
mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]);
mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]);
mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]);
ymm_t ymm_one = ymm_t(0);
if (id_ == 2) {
reg64_t reg_ptr_tmp = r11;
mov(reg_ptr_tmp, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]);
}
int offset = 0;
int d = num_ * sizeof(float);
for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
ymm_t ymm_u = ymm_t(1);
ymm_t ymm_r = ymm_t(2);
ymm_t ymm_s = ymm_t(3);
ymm_t ymm_ht_1 = ymm_t(4);
// W: {W_update, W_reset; W_state}
if (id_ == 0 || id_ == 2) {
vmovups(ymm_u, ptr[reg_ptr_gates + offset]);
vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]);
}
if (id_ == 1) {
vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]);
}
if (id_ == 1 || id_ == 2) {
vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]);
}
if (id_ == 0) {
// ht = act_gate(u) * act_cand(s)
act<ymm_t>(ymm_u, ymm_u, act_gate_);
act<ymm_t>(ymm_s, ymm_s, act_cand_);
vmulps(ymm_s, ymm_s, ymm_u);
vmovups(ptr[reg_ptr_ht + offset], ymm_s);
} else if (id_ == 1) {
// ht = act_gate(r) * ht_1
act<ymm_t>(ymm_r, ymm_r, act_gate_);
vmulps(ymm_r, ymm_r, ymm_ht_1);
vmovups(ptr[reg_ptr_ht + offset], ymm_r);
} else if (id_ == 2) {
// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx());
act<ymm_t>(ymm_u, ymm_u, act_gate_);
act<ymm_t>(ymm_s, ymm_s, act_cand_);
vmulps(ymm_s, ymm_s, ymm_u);
vsubps(ymm_u, ymm_one_inner, ymm_u);
vmulps(ymm_u, ymm_ht_1, ymm_u);
vaddps(ymm_u, ymm_s, ymm_u);
vmovups(ptr[reg_ptr_ht + offset], ymm_u);
}
offset += sizeof(float) * YMM_FLOAT_BLOCK;
}
ret();
}
#define DECLARE_GRU_CREATOR(name) \
class name##Creator : public JitCodeCreator<gru_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool UseMe(const gru_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
size_t CodeSize(const gru_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const gru_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
}
DECLARE_GRU_CREATOR(GRUH1);
DECLARE_GRU_CREATOR(GRUHtPart1);
DECLARE_GRU_CREATOR(GRUHtPart2);
#undef DECLARE_GRU_CREATOR
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kGRUH1, gen::GRUH1Creator);
REGISTER_JITKERNEL_GEN(kGRUHtPart1, gen::GRUHtPart1Creator);
REGISTER_JITKERNEL_GEN(kGRUHtPart2, gen::GRUHtPart2Creator);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/act.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
class GRUJitCode : public VActFunc {
public:
explicit GRUJitCode(int id, const gru_attr_t& attr, size_t code_size,
void* code_ptr = nullptr)
: VActFunc(code_size, code_ptr), id_(id), num_(attr.d) {
auto typeExchange = [](KernelType type) -> gen::operand_type {
if (type == KernelType::kVSigmoid) {
return operand_type::SIGMOID;
} else if (type == KernelType::kVRelu) {
return operand_type::RELU;
} else if (type == KernelType::kVTanh) {
return operand_type::TANH;
} else if (type == KernelType::kVIdentity) {
return operand_type::IDENTITY;
} else {
LOG(FATAL) << "Do not support this jit::KernelType: " << type;
}
return operand_type::IDENTITY;
};
act_gate_ = typeExchange(attr.act_gate);
act_cand_ = typeExchange(attr.act_cand);
this->genCode();
}
const char* name() const override {
std::string base = "GRUJitCode";
if (id_ == 0) {
base += "_H1";
} else if (id_ == 1) {
base += "_HtPart1";
} else if (id_ == 2) {
base += "_HtPart2";
}
auto AddTypeStr = [&](operand_type type) {
switch (type) {
case operand_type::RELU:
base += "_Relu";
break;
case operand_type::EXP:
base += "_Exp";
break;
case operand_type::SIGMOID:
base += "_Sigmoid";
break;
case operand_type::TANH:
base += "_Tanh";
break;
case operand_type::IDENTITY:
base += "_Identity";
break;
default:
break;
}
};
AddTypeStr(act_gate_);
AddTypeStr(act_cand_);
return base.c_str();
}
void genCode() override;
protected:
int id_;
int num_;
operand_type act_gate_;
operand_type act_cand_;
reg64_t param1{abi_param1};
};
#define DECLARE_GRU_JITCODE(name, id) \
class name##JitCode : public GRUJitCode { \
public: \
explicit name##JitCode(const gru_attr_t& attr, size_t code_size, \
void* code_ptr = nullptr) \
: GRUJitCode(id, attr, code_size, code_ptr) {} \
};
DECLARE_GRU_JITCODE(GRUH1, 0);
DECLARE_GRU_JITCODE(GRUHtPart1, 1);
DECLARE_GRU_JITCODE(GRUHtPart2, 2);
#undef DECLARE_GRU_JITCODE
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/platform/cpu_info.h"
#define XBYAK_USE_MMAP_ALLOCATOR
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
// Application Binary Interface
constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI),
abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX),
abi_param4(Xbyak::Operand::RCX);
constexpr Xbyak::Operand::Code g_abi_regs[] = {
Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15};
constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]);
using reg64_t = const Xbyak::Reg64;
using reg32_t = const Xbyak::Reg32;
using xmm_t = const Xbyak::Xmm;
using ymm_t = const Xbyak::Ymm;
using zmm_t = const Xbyak::Zmm;
using Label = Xbyak::Label;
typedef enum {
MUL = 0,
ADD,
SUB,
RELU,
EXP,
SIGMOID,
TANH,
IDENTITY
} operand_type;
#define DECLARE_JIT_CODE(codename) \
const char* name() const override { return #codename; }
class JitCode : public GenBase, public Xbyak::CodeGenerator {
public:
explicit JitCode(size_t code_size, void* code_ptr = nullptr)
: Xbyak::CodeGenerator(
(code_size % 4096 != 0 ? (code_size / 4096 + 1) * 4096 : code_size),
code_ptr) {}
virtual const char* name() const = 0;
virtual void genCode() = 0;
size_t getSize() const override { return CodeGenerator::getSize(); }
const unsigned char* getCodeInternal() override {
const Xbyak::uint8* code = CodeGenerator::getCode();
return code;
}
protected:
Xbyak::Reg64 param1{abi_param1};
const int EVEX_max_8b_offt = 0x200;
const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp;
virtual void preCode() {
for (int i = 0; i < num_g_abi_regs; ++i) {
push(Xbyak::Reg64(g_abi_regs[i]));
}
if (platform::MayIUse(platform::avx512f)) {
mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
}
}
virtual void postCode() {
for (int i = 0; i < num_g_abi_regs; ++i) {
pop(Xbyak::Reg64(g_abi_regs[num_g_abi_regs - 1 - i]));
}
ret();
}
void L(const char* label) { Xbyak::CodeGenerator::L(label); }
void L(const Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); }
// Enhanced vector extension
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt,
bool bcast = false) {
int scale = 0;
// Learn from https://github.com/intel/mkl-dnn
if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) {
offt = offt - 2 * EVEX_max_8b_offt;
scale = 1;
} else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) {
offt = offt - 4 * EVEX_max_8b_offt;
scale = 2;
}
auto re = Xbyak::RegExp() + base + offt;
if (scale) {
re = re + reg_EVEX_max_8b_offt * scale;
}
if (bcast) {
return zword_b[re];
} else {
return zword[re];
}
}
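  // Worked example: with EVEX_max_8b_offt == 0x200 and offt == 0x300, the
  // first branch rewrites the address as base + (0x300 - 0x400) +
  // reg_EVEX_max_8b_offt, where reg_EVEX_max_8b_offt was loaded with 0x400
  // in preCode(); the small residual displacement is then friendlier to
  // EVEX's compressed disp8*N encoding than a full 32-bit displacement.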
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/lstm.h"
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
void LSTMJitCode::genCode() {
if (use_peephole_) {
preCode();
}
reg64_t reg_ptr_gates = rax;
reg64_t reg_ptr_ct_1 = r9;
reg64_t reg_ptr_ct = r10;
reg64_t reg_ptr_ht = r11;
reg64_t reg_ptr_wp = r12;
mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]);
mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]);
mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]);
mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]);
if (use_peephole_) {
mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]);
}
int offset = 0;
int d = num_ * sizeof(float);
for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
/* gates: W_ch, W_ih, W_fh, W_oh */
ymm_t ymm_c = ymm_t(0);
ymm_t ymm_i = ymm_t(1);
ymm_t ymm_f = ymm_t(2);
ymm_t ymm_o = ymm_t(3);
ymm_t ymm_ct_1 = ymm_t(4);
ymm_t ymm_wp0 = ymm_t(5);
ymm_t ymm_wp1 = ymm_t(6);
ymm_t ymm_wp2 = ymm_t(7);
vmovups(ymm_c, ptr[reg_ptr_gates + offset]);
vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]);
vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]);
vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]);
if (!compute_c1h1_) {
vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]);
}
if (use_peephole_) {
vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]);
vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]);
vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]);
}
/* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */
// act_cand(c)
act<ymm_t>(ymm_c, ymm_c, act_cand_);
// act_gate(i) or act_gate(ct_1 * wp0 + i)
if (!compute_c1h1_ && use_peephole_) {
vmulps(ymm_wp0, ymm_ct_1, ymm_wp0);
vaddps(ymm_i, ymm_i, ymm_wp0);
}
act<ymm_t>(ymm_i, ymm_i, act_gate_);
vmulps(ymm_c, ymm_c, ymm_i);
if (!compute_c1h1_) {
// act_gate(f) or act_gate(ct_1 * wp1 + f)
if (use_peephole_) {
vmulps(ymm_wp1, ymm_ct_1, ymm_wp1);
vaddps(ymm_f, ymm_f, ymm_wp1);
}
act<ymm_t>(ymm_f, ymm_f, act_gate_);
// ct
vmulps(ymm_f, ymm_f, ymm_ct_1);
vaddps(ymm_f, ymm_f, ymm_c);
}
/* H_t = act_cell(C_t) * act_gate(o) */
// act_cell(C_t)
ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f;
ymm_t ymm_tmp = ymm_i;
act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
// act_gate(o) or act_gate(ct * wp2 + o)
if (use_peephole_) {
vmulps(ymm_wp2, ymm_ct, ymm_wp2);
vaddps(ymm_o, ymm_o, ymm_wp2);
}
act<ymm_t>(ymm_o, ymm_o, act_gate_);
// ht
vmulps(ymm_o, ymm_o, ymm_tmp);
// save ct and ht
vmovups(ptr[reg_ptr_ct + offset], ymm_ct);
vmovups(ptr[reg_ptr_ht + offset], ymm_o);
offset += sizeof(float) * YMM_FLOAT_BLOCK;
}
if (use_peephole_) {
postCode();
} else {
ret();
}
}
#define DECLARE_LSTM_CREATOR(name) \
class name##Creator : public JitCodeCreator<lstm_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool UseMe(const lstm_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
size_t CodeSize(const lstm_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const lstm_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
}
DECLARE_LSTM_CREATOR(LSTMCtHt);
DECLARE_LSTM_CREATOR(LSTMC1H1);
#undef DECLARE_LSTM_CREATOR
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kLSTMCtHt, gen::LSTMCtHtCreator);
REGISTER_JITKERNEL_GEN(kLSTMC1H1, gen::LSTMC1H1Creator);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/act.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
class LSTMJitCode : public VActFunc {
public:
explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr,
size_t code_size, void* code_ptr = nullptr)
: VActFunc(code_size, code_ptr),
num_(attr.d),
compute_c1h1_(compute_c1h1),
use_peephole_(attr.use_peephole) {
auto typeExchange = [](KernelType type) -> gen::operand_type {
if (type == KernelType::kVSigmoid) {
return operand_type::SIGMOID;
} else if (type == KernelType::kVRelu) {
return operand_type::RELU;
} else if (type == KernelType::kVTanh) {
return operand_type::TANH;
} else if (type == KernelType::kVIdentity) {
return operand_type::IDENTITY;
} else {
LOG(FATAL) << "Do not support this jit::KernelType: " << type;
}
return operand_type::IDENTITY;
};
act_gate_ = typeExchange(attr.act_gate);
act_cand_ = typeExchange(attr.act_cand);
act_cell_ = typeExchange(attr.act_cell);
this->genCode();
}
const char* name() const override {
std::string base = "LSTMJitCode";
if (use_peephole_) {
base += "_Peephole";
}
if (compute_c1h1_) {
base += "_C1H1";
}
auto AddTypeStr = [&](operand_type type) {
switch (type) {
case operand_type::RELU:
base += "_Relu";
break;
case operand_type::EXP:
base += "_Exp";
break;
case operand_type::SIGMOID:
base += "_Sigmoid";
break;
case operand_type::TANH:
base += "_Tanh";
break;
case operand_type::IDENTITY:
base += "_Identity";
break;
default:
break;
}
};
AddTypeStr(act_gate_);
AddTypeStr(act_cand_);
AddTypeStr(act_cell_);
return base.c_str();
}
void genCode() override;
protected:
int num_;
bool compute_c1h1_;
bool use_peephole_;
operand_type act_gate_;
operand_type act_cand_;
operand_type act_cell_;
reg64_t param1{abi_param1};
};
#define DECLARE_LSTM_JITCODE(name, compute_c1h1) \
class name##JitCode : public LSTMJitCode { \
public: \
explicit name##JitCode(const lstm_attr_t& attr, size_t code_size, \
void* code_ptr = nullptr) \
: LSTMJitCode(compute_c1h1, attr, code_size, code_ptr) {} \
};
DECLARE_LSTM_JITCODE(LSTMCtHt, false);
DECLARE_LSTM_JITCODE(LSTMC1H1, true);
#undef DECLARE_LSTM_JITCODE
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen_base.h"
#include <fstream>
#include <iostream>
#include <sstream>
DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
namespace paddle {
namespace operators {
namespace jit {
// The refer kernel does not need UseMe(); it always serves as the last fallback.
void GenBase::dumpCode(const unsigned char* code) const {
if (code) {
static int counter = 0;
std::ostringstream filename;
filename << "paddle_jitcode_" << name() << "." << counter << ".bin";
counter++;
    std::ofstream fout(filename.str(), std::ios::out | std::ios::binary);
if (fout.is_open()) {
fout.write(reinterpret_cast<const char*>(code), this->getSize());
fout.close();
}
}
}
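// Usage note: run a binary linked with this code under --dump_jitcode to get
// one paddle_jitcode_<name>.<counter>.bin file per generated kernel; the raw
// bytes can then be inspected with a disassembler, e.g. (illustrative)
// `objdump -D -b binary -m i386:x86-64 paddle_jitcode_VActJitCode_Relu.0.bin`.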
} // namespace jit
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <gflags/gflags.h>
#include <memory> // for unique_ptr
#include "paddle/fluid/operators/jit/kernel_base.h"
DECLARE_bool(dump_jitcode);
namespace paddle {
namespace operators {
namespace jit {
class GenBase : public Kernel {
public:
virtual ~GenBase() = default;
virtual const char* name() const = 0;
virtual size_t getSize() const = 0;
virtual const unsigned char* getCodeInternal() = 0;
template <typename Func>
Func getCode() {
const unsigned char* code = this->getCodeInternal();
if (FLAGS_dump_jitcode) {
this->dumpCode(code);
}
    // Note: reinterpret_cast<const Func> fails to compile on Mac clang,
    // so work around it with const_cast. A better approach is welcome.
return reinterpret_cast<Func>(const_cast<unsigned char*>(code));
}
protected:
void dumpCode(const unsigned char* code) const;
};
// A Creator is used to create the jitcode and save it in the pool.
// Every JitCode should have one creator.
class GenCreator {
public:
virtual ~GenCreator() = default;
};
template <typename Attr>
class JitCodeCreator : public GenCreator {
public:
virtual ~JitCodeCreator() = default;
// condition when this jit code can be used.
virtual bool UseMe(const Attr& attr) const = 0;
// estimate this code size
virtual size_t CodeSize(const Attr& attr) const = 0;
// create this code
virtual std::unique_ptr<GenBase> CreateJitCode(const Attr& attr) const = 0;
};
} // namespace jit
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/helper.h"
#include <algorithm> // tolower
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace jit {
#define ONE_CASE(key) \
case key: \
return #key
const char* to_string(KernelType kt) {
switch (kt) {
ONE_CASE(kVMul);
ONE_CASE(kVAdd);
ONE_CASE(kVAddRelu);
ONE_CASE(kVSub);
ONE_CASE(kVScal);
ONE_CASE(kVAddBias);
ONE_CASE(kVRelu);
ONE_CASE(kVIdentity);
ONE_CASE(kVExp);
ONE_CASE(kVSigmoid);
ONE_CASE(kVTanh);
ONE_CASE(kLSTMCtHt);
ONE_CASE(kLSTMC1H1);
ONE_CASE(kGRUH1);
ONE_CASE(kGRUHtPart1);
ONE_CASE(kGRUHtPart2);
ONE_CASE(kCRFDecoding);
ONE_CASE(kLayerNorm);
ONE_CASE(kNCHW16CMulNC);
default:
PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
return "NOT JITKernel";
}
return nullptr;
}
#undef ONE_CASE
KernelType to_kerneltype(const std::string& act) {
std::string lower = act;
std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
if (lower == "relu" || lower == "vrelu") {
return kVRelu;
} else if (lower == "identity" || lower == "videntity" || lower == "") {
return kVIdentity;
} else if (lower == "exp" || lower == "vexp") {
return kVExp;
} else if (lower == "sigmoid" || lower == "vsigmoid") {
return kVSigmoid;
} else if (lower == "tanh" || lower == "vtanh") {
return kVTanh;
}
PADDLE_THROW("Not support type: %s, or forget to add this case", act);
return kNone;
}
} // namespace jit
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <iostream>
#include <string>
#include <vector>
#include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/operators/jit/kernel_base.h"
#include "paddle/fluid/operators/jit/kernel_key.h"
#include "paddle/fluid/operators/jit/kernel_pool.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace operators {
namespace jit {
template <KernelType KT, typename KernelTuples, typename PlaceType>
inline typename std::enable_if<
std::is_same<typename KernelTuples::data_type, float>::value &&
std::is_same<PlaceType, platform::CPUPlace>::value,
typename KernelTuples::func_type>::type
GetJitCode(const typename KernelTuples::attr_type& attr) {
using Func = typename KernelTuples::func_type;
using Attr = typename KernelTuples::attr_type;
size_t key = JitCodeKey<Attr>(attr);
auto& codes = JitCodePool<KT>().Instance();
if (codes.Has(key)) {
return codes.AllKernels().at(key)->template getCode<Func>();
}
  // The creator does not depend on attr, so KernelKey can be used as the key.
KernelKey kkey(KT, PlaceType());
// pool: (KernelKey(type, place), vector<GenCreatorPtr>)
auto& creator_map = JitCodeCreatorPool().Instance().AllCreators();
auto iter = creator_map.find(kkey);
if (iter != creator_map.end()) {
auto& creators = iter->second;
for (auto& cur : creators) {
auto i = dynamic_cast<const JitCodeCreator<Attr>*>(cur.get());
if (i && i->UseMe(attr)) {
auto p = i->CreateJitCode(attr);
if (p) {
auto f = p->template getCode<Func>();
codes.Insert(key, std::move(p));
return f;
}
}
}
}
return nullptr;
}
template <KernelType KT, typename KernelTuples, typename PlaceType>
inline typename std::enable_if<
!std::is_same<typename KernelTuples::data_type, float>::value ||
!std::is_same<PlaceType, platform::CPUPlace>::value,
typename KernelTuples::func_type>::type
GetJitCode(const typename KernelTuples::attr_type& attr) {
return nullptr;
}
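// The two GetJitCode overloads above form an enable_if pair: only float
// kernels on CPUPlace can reach the jitcode path; every other (data type,
// place) combination statically resolves to this nullptr stub and falls
// through to the ordinary kernel pools in Get() below.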
// The refer code does not depend on attr; the template parameter exists
// only for the function-type cast. Refer kernels are always on CPUPlace.
template <KernelType KT, typename KernelTuples>
inline typename KernelTuples::func_type GetRefer() {
auto& ref_pool = ReferKernelPool().Instance().AllKernels();
KernelKey kkey(KT, platform::CPUPlace());
auto ref_iter = ref_pool.find(kkey);
PADDLE_ENFORCE(ref_iter != ref_pool.end(),
"Every Kernel should have reference function.");
auto& ref_impls = ref_iter->second;
for (auto& impl : ref_impls) {
auto i = dynamic_cast<const ReferKernel<KernelTuples>*>(impl.get());
if (i) {
return i->GetFunc();
}
}
return nullptr;
}
template <KernelType KT, typename KernelTuples,
typename PlaceType = platform::CPUPlace>
typename KernelTuples::func_type Get(
const typename KernelTuples::attr_type& attr) {
auto jitfunc = GetJitCode<KT, KernelTuples, PlaceType>(attr);
if (jitfunc) {
return jitfunc;
}
// pool: (KernelKey(type, place), vector<KernelPtr>)
KernelKey kkey(KT, PlaceType());
auto& pool = KernelPool().Instance().AllKernels();
auto iter = pool.find(kkey);
if (iter != pool.end()) {
auto& impls = iter->second;
for (auto& impl : impls) {
auto i = dynamic_cast<const KernelMore<KernelTuples>*>(impl.get());
if (i && i->UseMe(attr)) {
return i->GetFunc();
}
}
}
// The last implementation should be reference function on CPUPlace.
return GetRefer<KT, KernelTuples>();
}
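// A minimal usage sketch (buffers and size are illustrative): for the float
// VAdd kernel on CPU,
//   auto f = jit::Get<jit::kVAdd, jit::XYZNTuples<float>>(n);
//   f(x, y, z, n);
// tries jitcode first, then the other registered implementations, and
// finally falls back to the refer kernel.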
const char* to_string(KernelType kt);
KernelType to_kerneltype(const std::string& act);
inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) {
os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
<< "],act_cand[" << to_string(attr.act_cand) << "],act_cell["
<< to_string(attr.act_cell) << "],use_peephole["
<< (attr.use_peephole ? "True" : "False") << "]";
return os;
}
inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
<< "],act_cand[" << to_string(attr.act_cand) << "]";
return os;
}
} // namespace jit
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/kernel_key.h"
namespace paddle {
namespace operators {
namespace jit {
template <>
size_t JitCodeKey<int>(const int& d) {
return d;
}
constexpr int act_type_shift = 3;  // support up to 2^3 activation types
template <>
size_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
size_t key = attr.d;
int gate_key = static_cast<int>(attr.act_gate) << 1;
int cand_key = static_cast<int>(attr.act_cand) << (1 + act_type_shift);
int cell_key = static_cast<int>(attr.act_cell) << (1 + act_type_shift * 2);
return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key +
attr.use_peephole;
}
template <>
size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
size_t key = attr.d;
return (key << (act_type_shift * 2)) + static_cast<int>(attr.act_gate) +
(static_cast<int>(attr.act_cand) << act_type_shift);
}
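// Worked example (enum values illustrative): with act_type_shift == 3, a
// gru_attr_t with d = 8, act_gate = 2 and act_cand = 3 yields
// (8 << 6) + 2 + (3 << 3) = 538, so the dimension occupies the high bits and
// each activation type gets its own 3-bit field.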
} // namespace jit
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include "paddle/fluid/operators/jit/kernel_base.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace operators {
namespace jit {
struct KernelKey {
struct Hash {
size_t operator()(const KernelKey& key) const {
int place = key.place_.which(); // less than 2^8
int type = static_cast<int>(key.type_) << 8; // less than 2^(32-8)
std::hash<int> hasher;
return hasher(place + type);
}
};
KernelType type_;
platform::Place place_;
KernelKey(KernelType type, platform::Place place)
: type_(type), place_(place) {}
size_t hash_key() const { return Hash()(*this); }
bool operator==(const KernelKey& o) const {
return platform::places_are_same_class(place_, o.place_) &&
type_ == o.type_;
}
bool operator!=(const KernelKey& o) const { return !(*this == o); }
};
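// Hash sketch: the place index occupies the low 8 bits and the kernel type
// the bits above them, so e.g. type 3 on place 1 hashes the integer
// (3 << 8) + 1 = 769 (values illustrative).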
// Every JitCode should have a method to compute its key from the attributes.
template <typename Attr>
size_t JitCodeKey(const Attr& attr);
} // namespace jit
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/kernel_pool.h"
#include <memory> // for shared_ptr
#include <string>
#include <unordered_map>
namespace paddle {
namespace operators {
namespace jit {
JitCodeCreatorPool& JitCodeCreatorPool::Instance() {
static JitCodeCreatorPool g_creator_pool;
return g_creator_pool;
}
KernelPool& KernelPool::Instance() {
static KernelPool g_kernel_pool;
return g_kernel_pool;
}
ReferKernelPool& ReferKernelPool::Instance() {
static ReferKernelPool g_refer_kernel_pool;
return g_refer_kernel_pool;
}
} // namespace jit
} // namespace operators
} // namespace paddle