diff --git a/CMakeLists.txt b/CMakeLists.txt
index c31f51a3f7371bd7b1b0ca3234091a35868806ce..66dcef0013efb486b532f9ae17e9ae2040dc9e38 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,16 +126,12 @@ if(ANDROID OR IOS)
     add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()
 
-if (APPLE OR WIN32)
+if (APPLE)
     set(WITH_MKL OFF CACHE STRING
-        "Disable MKL for building on mac and windows" FORCE)
+        "Disable MKL for building on mac" FORCE)
 endif()
 
 if (WIN32)
-    set(WITH_DSO OFF CACHE STRING
-            "Disable DSO when compiling for Windows" FORCE)
-    set(WITH_MKL OFF CACHE STRING
-            "Disable MKL when compiling for Windows" FORCE)
     set(WITH_DISTRIBUTE OFF CACHE STRING
             "Disable DISTRIBUTE when compiling for Windows" FORCE)
     set(WITH_C_API OFF CACHE STRING
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 09bec347dbd569203103eccc7dbc0521c291bc0a..fb899e3d7cd4224acd25a559d0e18a09f552ad7d 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -44,9 +44,9 @@ if(WIN32)
 set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
 endif(WIN32)
 
-if(Apple)
+if(APPLE)
 set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
-endif(Apple)
+endif(APPLE)
 
 find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
     PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index b280db23b9b27bc658a79d01ea81122d2c987666..c29375cd0589764507f9a10859a6d4fdbb29716e 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -23,15 +23,14 @@ SET(MKLDNN_SOURCES_DIR    ${THIRD_PARTY_PATH}/mkldnn)
 SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
 
-IF(WIN32 OR APPLE)
+IF(APPLE)
     MESSAGE(WARNING
-        "Windows or Mac is not supported with MKLDNN in Paddle yet."
+        "Mac is not supported with MKLDNN in Paddle yet."
         "Force WITH_MKLDNN=OFF")
-    SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
+    SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in MacOS" FORCE)
     return()
 ENDIF()
 
-SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
 MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
@@ -44,10 +43,14 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
 ELSE()
     MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
-SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
-SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
-SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
-SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+
+IF(NOT WIN32)
+    SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
+    SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
+    SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
+    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+ENDIF(NOT WIN32)
+
 ExternalProject_Add(
     ${MKLDNN_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -58,8 +61,15 @@ ExternalProject_Add(
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
     CMAKE_ARGS          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS          -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+    CMAKE_ARGS          -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+    CMAKE_ARGS          -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+    CMAKE_ARGS          -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+    CMAKE_ARGS          -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
     CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
     CMAKE_ARGS          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+    CMAKE_ARGS          -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
     CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
     CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
@@ -67,6 +77,11 @@ ExternalProject_Add(
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
                         -DMKLROOT:PATH=${MKLML_ROOT}
 )
+if(WIN32)
+    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
+else(WIN32)
+    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
+endif(WIN32)
 
 ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
@@ -85,10 +100,14 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
 
 # copy the real so.0 lib to install dir
 # it can be directly contained in wheel or capi
-SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
-ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
-    COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-    DEPENDS mkldnn)
+if(WIN32)
+    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll)
+else(WIN32)
+    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
+    ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
+            COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
+            DEPENDS mkldnn)
+endif(WIN32)
 ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
 
 IF(WITH_C_API)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index dc5427acd45f5da90317e7a3dc25f5453e2a7a00..d49839a89d78803f0fad58192283deae47ad72ef 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -16,56 +16,67 @@ IF(NOT ${WITH_MKLML})
   return()
 ENDIF(NOT ${WITH_MKLML})
 
-IF(WIN32 OR APPLE)
+IF(APPLE)
     MESSAGE(WARNING
-        "Windows or Mac is not supported with MKLML in Paddle yet."
+        "Mac is not supported with MKLML in Paddle yet."
         "Force WITH_MKLML=OFF")
     SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
     return()
 ENDIF()
 
 INCLUDE(ExternalProject)
-
-SET(MKLML_PROJECT       "extern_mklml")
-IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
-  SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
-SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
-SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
 SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
 SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
 SET(MKLML_ROOT          ${MKLML_INSTALL_DIR})
 SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
 SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
-SET(MKLML_LIB           ${MKLML_LIB_DIR}/libmklml_intel.so)
-SET(MKLML_IOMP_LIB      ${MKLML_LIB_DIR}/libiomp5.so)
+if(WIN32)
+    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/mklml.lib)
+    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5md.lib)
+    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/mklml.dll)
+    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5md.dll)
+else()
+    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml_intel.so)
+    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.so)
+    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/libmklml_intel.so)
+    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5.so)
+endif()
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 
-INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
+IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
+    MESSAGE(STATUS "use pre defined download url")
+    if(WIN32)
+        SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE)
+        SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+    else()
+        SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
+        SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+    ENDIF()
+endif()
 
-FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(MKLML)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
-  "        DESTINATION ${MKLML_DST_DIR})\n")
+SET(MKLML_PROJECT       "extern_mklml")
+MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
+SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
+SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 
 ExternalProject_Add(
     ${MKLML_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${MKLML_SOURCE_DIR}
+    PREFIX                 ${MKLML_SOURCE_DIR}
+    URL                    ${MKLML_URL}
     DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz 
-                          && tar zxf ${MKLML_VER}.tgz
     DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
+    CONFIGURE_COMMAND     ""
+    BUILD_COMMAND         ""
+    UPDATE_COMMAND ""
+    INSTALL_COMMAND
+        ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/include ${MKLML_INC_DIR} &&
+        ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR}
 )
 
+INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
+
 ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
 ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index a8b9dcfcf5eec39af0f59c03b1ed9bd4b71ee7bf..c6fe2e970d3e02985e3f2b8d5df6a7358beed514 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -267,7 +267,11 @@ function(cc_library TARGET_NAME)
           list(APPEND cc_library_DEPS dynload_mklml)
         endif()
         add_dependencies(${TARGET_NAME} mklml)
-        target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
+        if(WIN32)
+          target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
+        else(WIN32)
+          target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
+        endif(WIN32)
       endif()
       # remove link to python, see notes at:
       # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 9f0adef7aa603ec5a3c8a5aa347613f462c43e60..48279bc809dde9e97c967c3ea5d03fbd7b89b017 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -115,20 +115,20 @@ if (NOT PROTOBUF_FOUND OR WIN32)
             )
 endif ()
 
-if (NOT CBLAS_FOUND)
-    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
-    copy(openblas_lib
-            SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
-            DSTS ${dst_dir} ${dst_dir}
-            DEPS extern_openblas
-            )
-elseif (WITH_MKLML)
+if (WITH_MKLML)
     set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
     copy(mklml_lib
             SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
             DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
             DEPS mklml
             )
+elseif (NOT CBLAS_FOUND OR WIN32)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
+    copy(openblas_lib
+            SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
+            DSTS ${dst_dir} ${dst_dir}
+            DEPS extern_openblas
+            )
 endif ()
 
 if (WITH_MKLDNN)
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 170e0f839719c71d56008abefb79c7814d0f3e76..b6974c6af290438f827c16bb478eb43e3cf42247 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -208,6 +208,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
 paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
+paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
@@ -350,6 +351,23 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
+paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
+paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.CompressPass.apply ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.ImitationGraph.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.ImitationGraph.all_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.__init__ ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None))
+paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.MagnitudePruner.__init__ ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.MagnitudePruner.prune ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.RatioPruner.__init__ ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.RatioPruner.prune ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index d0beb8361c206a280e3e3ebe797fd49d7a0e691c..2d66dcbf5c01166b68f1c34e7492e9f64ad50a34 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -48,10 +48,10 @@ if(WITH_GPU)
     nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
     add_dependencies(tensor tensor_util)
   else()
-    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
   endif(WIN32)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
 endif()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 6c8bec32de2a8c1d59155b812c05d5181acb82be..8fbbc6584e121d22bdec8173d501a35dc97c9c06 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -157,13 +157,8 @@ bool CheckLoD(const LoD &in, int tensor_height) {
     if (level.size() < 2) return false;
     // check: the first offset(the begin offset) of each level should be 0.
     if (level.front() != 0) return false;
-    // check: all the offsets in a level should be ascending(no same items
-    // allows).
-    if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) {
-          if (a < b) return true;
-          return false;
-        })) {
-      LOG(INFO) << "ascending error";
+    // check: all the offsets in a level should be ascending(allow same items)
+    if (!std::is_sorted(level.begin(), level.end())) {
       return false;
     }
   }
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index cd50aaa26054b78f1b1e8f0d470b397892155a2b..15928c18d38b8a513b00f993b57faab43978bf53 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -217,6 +217,11 @@ TEST(LoD, CheckLoD) {
   // check with underlying tensor storage.
   ASSERT_TRUE(CheckLoD(relative_lod, 5));
   ASSERT_FALSE(CheckLoD(relative_lod, 9));
+
+  // check whether lod is ascending-sorted (allow same items)
+  ASSERT_TRUE(CheckLoD({{0, 1, 2, 3, 4, 5}}, 5));
+  ASSERT_TRUE(CheckLoD({{0, 1, 3, 3, 4, 5}}, 5));
+  ASSERT_FALSE(CheckLoD({{0, 1, 3, 2, 5}}, 5));
 }
 
 TEST(LoD, CheckAbsLoD) {
diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
index 5fcb17b9f3ac390548aba33db7d0b8350cde7e00..42190b52289bfc6fc510f13cb5190a0d3e03b836 100644
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -31,10 +31,12 @@ std::map<std::string,
                             std::shared_ptr<std::unordered_map<
                                 std::string, std::shared_ptr<ngraph::Node>>>)>>
     NgraphBridge::NG_NODE_MAP = {
+        {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
         {"mul", paddle::operators::ngraphs::BuildMulNode},
         {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
         {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
-        {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}};
+        {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
+        {"top_k", paddle::operators::ngraphs::BuildTopKNode}};
 
 void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
   auto& op_type = op->Type();
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index e8debec7f13706b7fc5a4882d237ee2257e53b7e..d7352c5ee5a63bc8b8023e1d3459c5b9f5fab8a7 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -123,6 +123,8 @@ class OpDesc {
 
   BlockDesc *Block() { return this->block_; }
 
+  const BlockDesc *Block() const { return this->block_; }
+
  private:
   template <typename MapType>
   static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b2d4de5916ba89e6692e10ac7ebef53484b576ca..d5992354da646e1e1015ad83a116348dedaf7656 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -476,6 +476,28 @@ const Tensor* ExecutionContext::LegacyInput<Tensor>(
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const {
+  auto it = ctx_.inputs.find(name);
+  if (it == ctx_.inputs.end()) {
+    return {};
+  }
+  const std::vector<Variable*>& vars = it->second;
+  std::vector<const Tensor*> res;
+  res.reserve(vars.size());
+  std::transform(vars.begin(), vars.end(), std::back_inserter(res),
+                 [&](Variable* var) -> const Tensor* {
+                   if (var == nullptr) return nullptr;
+                   PADDLE_ENFORCE(
+                       var->IsType<LoDTensor>(),
+                       "should be LoDTensor, but the received type is %s",
+                       ToTypeName(var->Type()));
+                   return &(var->Get<LoDTensor>());
+                 });
+  return res;
+}
+
+template <>
+const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
+    const std::string& name) const {
   auto names = op().Inputs(name);
   std::vector<const Tensor*> res;
   res.reserve(names.size());
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 39190d07b4ccdd5ffd03e2d50bb0e577ac00af75..1fe2daacf1369902cde732422b4e65c3d156250f 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -197,8 +197,31 @@ class ExecutionContext {
 
   const std::vector<const Variable*> MultiInputVar(
       const std::string& name) const {
-    auto names = op_.Inputs(name);
+    auto it = ctx_.inputs.find(name);
+    if (it == ctx_.inputs.end()) {
+      return {};
+    }
     std::vector<const Variable*> res;
+    res.reserve(it->second.size());
+    std::transform(it->second.begin(), it->second.end(),
+                   std::back_inserter(res),
+                   [this](Variable* var) { return var; });
+    return res;
+  }
+
+  std::vector<Variable*> MultiOutputVar(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    auto it = ctx_.outputs.find(name);
+    if (it == ctx_.outputs.end()) {
+      return {};
+    }
+    return it->second;
+  }
+
+  const std::vector<Variable*> LegacyMultiInputVar(
+      const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<Variable*> res;
     res.reserve(names.size());
     std::transform(names.begin(), names.end(), std::back_inserter(res),
                    [this](const std::string& name) {
@@ -208,7 +231,7 @@ class ExecutionContext {
     return res;
   }
 
-  std::vector<Variable*> MultiOutputVar(const std::string& name) const {
+  std::vector<Variable*> LegacyMultiOutputVar(const std::string& name) const {
     auto names = op_.Outputs(name);
     std::vector<Variable*> res;
     res.reserve(names.size());
@@ -250,6 +273,38 @@ class ExecutionContext {
 
   template <typename T>
   const std::vector<const T*> MultiInput(const std::string& name) const {
+    auto it = ctx_.inputs.find(name);
+    if (it == ctx_.inputs.end()) {
+      return {};
+    }
+    const std::vector<Variable*>& vars = it->second;
+    std::vector<const T*> res;
+    res.reserve(vars.size());
+    std::transform(vars.begin(), vars.end(), std::back_inserter(res),
+                   [&](Variable* var) -> const T* {
+                     return var == nullptr ? nullptr : &var->Get<T>();
+                   });
+    return res;
+  }
+
+  template <typename T>
+  std::vector<T*> MultiOutput(const std::string& name) const {
+    auto it = ctx_.outputs.find(name);
+    if (it == ctx_.outputs.end()) {
+      return {};
+    }
+    const std::vector<Variable*>& vars = it->second;
+    std::vector<T*> res;
+    res.reserve(vars.size());
+    std::transform(vars.begin(), vars.end(), std::back_inserter(res),
+                   [&](Variable* var) -> T* {
+                     return var == nullptr ? nullptr : var->GetMutable<T>();
+                   });
+    return res;
+  }
+
+  template <typename T>
+  const std::vector<const T*> LegacyMultiInput(const std::string& name) const {
     auto names = op_.Inputs(name);
     std::vector<const T*> res;
     res.reserve(names.size());
@@ -262,7 +317,7 @@ class ExecutionContext {
   }
 
   template <typename T>
-  std::vector<T*> MultiOutput(const std::string& name) const {
+  std::vector<T*> LegacyMultiOutput(const std::string& name) const {
     auto names = op_.Outputs(name);
     std::vector<T*> res;
     res.reserve(names.size());
@@ -321,6 +376,10 @@ template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const;
 
+template <>
+const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
+    const std::string& name) const;
+
 template <>
 Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
 
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
index 824f75b3d3cfa03020182d2ea0b2970bdd6aeeca..e0a848273b8d6b50eb1706998e368141a0d1f7f3 100644
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -25,6 +25,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class OperatorBase;
+
 using InferShapeVarPtr = boost::variant<VarDesc *, Variable *>;
 
 class InferShapeContext {
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index 57335847a1931de6599560c6e9395a910282b0ee..ef096c2b810187c50fbcde7d93d9e5a2ecd8b0f3 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/var_type.h"
 
 namespace paddle {
 namespace framework {
@@ -27,6 +28,8 @@ void Tensor::check_memory_size() const {
       "or maybe the required data-type mismatches the data already stored.");
 }
 
+Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
+
 size_t Tensor::memory_size() const {
   return holder_ == nullptr ? 0UL : holder_->size() - offset_;
 }
@@ -101,5 +104,12 @@ const DDim& Tensor::dims() const { return dims_; }
 
 int64_t Tensor::numel() const { return product(dims_); }
 
+void Tensor::ResetHolder(std::shared_ptr<memory::Allocation> holder) {
+  if (holder_) {
+    PADDLE_ENFORCE_EQ(numel() * SizeOfType(type()), holder->size());
+  }
+  holder_ = holder;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 6a1cbe5cd567429c922156f8bce7ca710b15a0f5..40606d9b06baf4dbebf87f3c02580e49ae6e2a70 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -69,6 +69,8 @@ class Tensor {
  public:
   Tensor() : type_(proto::VarType::FP32), offset_(0) {}
 
+  explicit Tensor(const proto::VarType::Type&);
+
   /*! Return a pointer to mutable memory block. */
   template <typename T>
   T* data();
@@ -162,6 +164,8 @@ class Tensor {
     return std::move(holder_);
   }
 
+  void ResetHolder(std::shared_ptr<memory::Allocation> holder);
+
  private:
   /*! holds the memory block if allocated. */
   std::shared_ptr<memory::Allocation> holder_;
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index cab6d9b67e4e64335be0a386bfffb7ebe4373b3e..871c7bd2a77d1cc5057177619b5cd7b2083ff308 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/temporary_allocator.h"
 
 namespace paddle {
 namespace framework {
@@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
                src_ptr, size);
 }
 
+template <typename T>
+paddle::framework::Tensor GetTensor(
+    memory::allocation::AllocationPtr temp_allocation_ptr,
+    const framework::DDim& dim) {
+  auto& deleter = temp_allocation_ptr.get_deleter();
+  auto* allocation_ptr = temp_allocation_ptr.release();
+  auto shared_allocation =
+      std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
+
+  PADDLE_ENFORCE(
+      dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
+      "The AllocationPtr must be TemporaryAllocation.");
+  PADDLE_ENFORCE_EQ(allocation_ptr->size(),
+                    framework::product(dim) * sizeof(T));
+
+  paddle::framework::Tensor temp_tensor(
+      framework::ToDataType(std::type_index(typeid(T))));
+  temp_tensor.Resize(dim);
+  temp_tensor.ResetHolder(std::move(shared_allocation));
+  return temp_tensor;
+}
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index c751e8515829d06970c55f097f50de8bf33ee2a4..3937884ce4a5a16a1093ac8977033eaa98b2678e 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -231,11 +231,14 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                   inputs[i].data.length());
     } else {
 #ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx =
+          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
       auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
       memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                    platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(),
-                   0);  // stream 0 for sync copy
+                   inputs[i].data.length(), dev_ctx->stream());
 #else
       PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 3d121e046004dfe6fc6953e0b23852b9ecda5c1b..102147a493ed1454db1a78124200f163f68e555b 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -208,11 +208,14 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                   inputs[i].data.length());
     } else {
 #ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx =
+          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
       auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
       memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                    platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(),
-                   0);  // stream 0 for sync copy
+                   inputs[i].data.length(), dev_ctx->stream());
 #else
       PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 8d0d96d391efd7f0f11e9d48f5a6221431bd3824..f42ee9a697bfb4b8fefd4d3ba65afea4e74f0a85 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -89,12 +89,21 @@ endif()
 
 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
-               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
+  if(NOT WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
+                 ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
+  else(WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX}
+            ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX})
+  endif(WIN32)
   set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
   if(EXISTS ${MKLDNN_PATH})
     include_directories("${MKLDNN_PATH}/include")
-    set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+    if(WIN32)
+      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
+    else(WIN32)
+      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+    endif(WIN32)
   endif()
 else()
   set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 46ce61b73611d05369f90e7d8f97e9b6724b860f..95bbc74a5961eb28a0d8fbd7c680c0740fc68d8a 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -75,6 +75,11 @@ set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
 download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc)
 
+# MM DNN
+set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn")
+download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc)
+
 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
 download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index 227e2ff45873fded45899146b97a7bee0c8ad763..12d61d06ce188a2478448373427f2defae5a2524 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -254,5 +254,16 @@ TEST(Analyzer_dam, compare) { compare(); }
 TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
 #endif
 
+// Compare Deterministic result
+TEST(Analyzer_dam, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index 310852e2f7cb284bda3041911d0059b55ee3b477..142801382b4fdeaa63f51390b63cf6db6cb8f60d 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -180,6 +180,17 @@ TEST(Analyzer_LAC, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+// Compare Deterministic result
+TEST(Analyzer_LAC, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8aaab6d6649e1d4b6db7695df0e9dd219c89422c
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
@@ -0,0 +1,178 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+using contrib::AnalysisConfig;
+
+struct DataRecord {
+  std::vector<std::vector<int64_t>> query_data_all, title_data_all;
+  std::vector<size_t> lod1, lod2;
+  size_t batch_iter{0};
+  size_t batch_size{1};
+  size_t num_samples;  // total number of samples
+  DataRecord() = default;
+  explicit DataRecord(const std::string &path, int batch_size = 1)
+      : batch_size(batch_size) {
+    Load(path);
+  }
+  DataRecord NextBatch() {
+    DataRecord data;
+    size_t batch_end = batch_iter + batch_size;
+    // NOTE skip the final batch, if no enough data is provided.
+    if (batch_end <= query_data_all.size()) {
+      data.query_data_all.assign(query_data_all.begin() + batch_iter,
+                                 query_data_all.begin() + batch_end);
+      data.title_data_all.assign(title_data_all.begin() + batch_iter,
+                                 title_data_all.begin() + batch_end);
+      // Prepare LoDs
+      data.lod1.push_back(0);
+      data.lod2.push_back(0);
+      CHECK(!data.query_data_all.empty());
+      CHECK(!data.title_data_all.empty());
+      CHECK_EQ(data.query_data_all.size(), data.title_data_all.size());
+      for (size_t j = 0; j < data.query_data_all.size(); j++) {
+        // calculate lod
+        data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size());
+        data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size());
+      }
+    }
+    batch_iter += batch_size;
+    return data;
+  }
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    std::string line;
+    int num_lines = 0;
+    while (std::getline(file, line)) {
+      num_lines++;
+      std::vector<std::string> data;
+      split(line, '\t', &data);
+      // load query data
+      std::vector<int64_t> query_data;
+      split_to_int64(data[0], ' ', &query_data);
+      // load title data
+      std::vector<int64_t> title_data;
+      split_to_int64(data[1], ' ', &title_data);
+      query_data_all.push_back(std::move(query_data));
+      title_data_all.push_back(std::move(title_data));
+    }
+    num_samples = num_lines;
+  }
+};
+
+void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+                   int batch_size) {
+  PaddleTensor lod_query_tensor, lod_title_tensor;
+  lod_query_tensor.name = "left";
+  lod_title_tensor.name = "right";
+  auto one_batch = data->NextBatch();
+  int size1 = one_batch.lod1[one_batch.lod1.size() - 1];  // token batch size
+  int size2 = one_batch.lod2[one_batch.lod2.size() - 1];  // token batch size
+  lod_query_tensor.shape.assign({size1, 1});
+  lod_query_tensor.lod.assign({one_batch.lod1});
+  lod_title_tensor.shape.assign({size2, 1});
+  lod_title_tensor.lod.assign({one_batch.lod2});
+  // assign data
+  TensorAssignData<int64_t>(&lod_query_tensor, one_batch.query_data_all);
+  TensorAssignData<int64_t>(&lod_title_tensor, one_batch.title_data_all);
+  // Set inputs.
+  input_slots->assign({lod_query_tensor, lod_title_tensor});
+  for (auto &tensor : *input_slots) {
+    tensor.dtype = PaddleDType::INT64;
+  }
+}
+
+void SetConfig(contrib::AnalysisConfig *cfg) {
+  cfg->model_dir = FLAGS_infer_model;
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  std::vector<PaddleTensor> input_slots;
+  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
+  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
+  for (int bid = 0; bid < epoch; ++bid) {
+    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
+  }
+}
+
+// Easy for profiling independently.
+TEST(Analyzer_MM_DNN, profile) {
+  contrib::AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);
+
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    PADDLE_ENFORCE_EQ(outputs.size(), 2UL);
+    for (auto &output : outputs) {
+      size_t size = GetSize(output);
+      PADDLE_ENFORCE_GT(size, 0);
+      float *result = static_cast<float *>(output.data.data());
+      // output is probability, which is in (-1, 1).
+      for (size_t i = 0; i < size; i++) {
+        EXPECT_GT(result[i], -1);
+        EXPECT_LT(result[i], 1);
+      }
+    }
+  }
+}
+
+// Check the fuse status
+TEST(Analyzer_MM_DNN, fuse_statis) {
+  contrib::AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  int num_ops;
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
+}
+
+// Compare result of NativeConfig and AnalysisConfig
+TEST(Analyzer_MM_DNN, compare) {
+  contrib::AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
+}
+
+// Compare Deterministic result
+TEST(Analyzer_MM_DNN, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index 66d85420c5701b1bf308b6850465beb6d8a0b703..f19a2ed59ef2f666393124323ffee2f1e79ccf06 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -179,5 +179,16 @@ TEST(Analyzer_Chinese_ner, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+// Compare Deterministic result
+TEST(Analyzer_Chinese_ner, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index abc63577b7913a3c9de7d6c16d8ac3e85ffd7c3c..764ae5ed8506a7ed7dc51a5c36d0dd7e9df925f3 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -85,6 +85,17 @@ TEST(Analyzer_resnet50, compare) { compare(); }
 TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); }
 #endif
 
+// Compare Deterministic result
+TEST(Analyzer_resnet50, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index 1ae2b4b03a1b2a66b3ddc8cb66d9575751a52297..17f4587a5093a2f1cd2d8acc0e17f2129ad36353 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -265,6 +265,17 @@ TEST(Analyzer_rnn1, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+// Compare Deterministic result
+TEST(Analyzer_rnn1, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
 // Test Multi-Thread.
 TEST(Analyzer_rnn1, multi_thread) {
   contrib::AnalysisConfig cfg;
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
index e2985006f0ed858e778bf4737be3aaee0e056021..f8354e76871e7f489fd21f2f74e7402db01845c3 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -158,5 +158,16 @@ TEST(Analyzer_rnn2, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+// Compare Deterministic result
+TEST(Analyzer_rnn2, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
index 858191184a377a26042c98e17d5b8df782575efc..f5082cd60f1ae4e4eaf9dbe59a446ace900ee456 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -204,5 +204,16 @@ TEST(Analyzer_seq_conv1, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+// Compare Deterministic result
+TEST(Analyzer_seq_conv1, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
index 34a241f070fdc62d1b1e94835fb1dad405baafa9..79f3c81ade450fa00419b652042b2cfc79b08e4c 100644
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -106,6 +106,17 @@ TEST(Analyzer_Text_Classification, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+// Compare Deterministic result
+TEST(Analyzer_Text_Classification, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
 TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index a8f7d5c4461964bcb18bc8df24e282ea89264aa8..d73bccefd5fc8a8ad8679b7de3feac50f786daed 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -145,6 +145,17 @@ TEST(Analyzer_vis, compare) { compare(); }
 TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); }
 #endif
 
+// Compare Deterministic result
+TEST(Analyzer_vis, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index b07949c196ca1d41bb33a0b0499ebb3204d1be4a..b0c8f395ce05fbfceaec3d8b69367292eca714e4 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -45,6 +45,7 @@ DEFINE_bool(use_analysis, true,
             "Running the inference program in analysis mode.");
 DEFINE_bool(record_benchmark, false,
             "Record benchmark after profiling the model");
+DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
 
 DECLARE_bool(profile);
 DECLARE_int32(paddle_num_threads);
@@ -85,7 +86,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
         float *pdata = static_cast<float *>(out.data.data());
         float *pdata_ref = static_cast<float *>(ref_out.data.data());
         for (size_t j = 0; j < size; ++j) {
-          EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3);
+          EXPECT_NEAR(pdata_ref[j], pdata[j], FLAGS_accuracy);
         }
         break;
       }
@@ -283,6 +284,26 @@ void TestPrediction(const PaddlePredictor::Config *config,
   }
 }
 
+void CompareDeterministic(
+    const PaddlePredictor::Config *config,
+    const std::vector<std::vector<PaddleTensor>> &inputs) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  auto predictor = CreateTestPredictor(config, FLAGS_use_analysis);
+
+  // warmup run
+  std::vector<PaddleTensor> warmup_outputs, outputs;
+  predictor->Run(inputs[0], &warmup_outputs, batch_size);
+
+  // run num_times to Compare Deterministic Result.
+  for (int i = 0; i < num_times; i++) {
+    for (size_t j = 0; j < inputs.size(); j++) {
+      predictor->Run(inputs[j], &outputs, batch_size);
+      CompareResult(outputs, warmup_outputs);
+    }
+  }
+}
+
 void CompareNativeAndAnalysis(
     const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs) {
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index d9b0c66e5727e80486423ab065dccf9105775127..4a14eb941cd98e333a3e85aff064e6099b3be396 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -16,6 +16,7 @@ add_subdirectory(metrics)
 add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)
 add_subdirectory(sequence_ops)
+add_subdirectory(jit)
 
 if(WITH_DISTRIBUTE)
     add_subdirectory(distributed)
@@ -42,8 +43,7 @@ if (WITH_DISTRIBUTE)
     SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
 endif()
 
-register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
-
+register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
 
 # warpctc_op needs cudnn 7 above
 if (WITH_GPU AND NOT WIN32)
@@ -65,7 +65,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
@@ -92,4 +92,8 @@ cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
 
+if (WITH_PYTHON)
+  cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind)
+endif()
+
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index 249f308c13ff5636fbaa6747b28cab7886b7e736..2519f5e7acdb7828743c6e114adfe5e530058406 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
@@ -123,6 +124,8 @@ class GemmConvKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
 
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
     const int batch_size = static_cast<int>(input->dims()[0]);
 
     // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
@@ -155,13 +158,16 @@ class GemmConvKernel : public framework::OpKernel<T> {
     // to call the matrix multiplication interface.
     Tensor col_matrix;
     if (is_expand) {
-      col.mutable_data<T>(col_shape, context.GetPlace());
+      auto tmp_allocation_ptr =
+          platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
+              framework::product(col_shape) * sizeof(T));
+      col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
       col_matrix.ShareDataWith(col);
       col_matrix.Resize(col_matrix_shape);
     }
 
-    framework::DDim input_shape = framework::slice_ddim(
-        input->dims(), 1, static_cast<int>(input->dims().size()));
+    framework::DDim input_shape =
+        framework::slice_ddim(input->dims(), 1, input->dims().size());
 
     framework::DDim filter_matrix_shape = {filter.dims()[0],
                                            filter.numel() / filter.dims()[0]};
@@ -178,7 +184,6 @@ class GemmConvKernel : public framework::OpKernel<T> {
     math::Vol2ColFunctor<DeviceContext, T> vol2col;
     math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
 
-    auto& dev_ctx = context.template device_context<DeviceContext>();
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     for (int i = 0; i < batch_size; i++) {
       Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
@@ -237,6 +242,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
     // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
     std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
     // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
@@ -262,8 +269,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
     framework::DDim col_matrix_shape =
         framework::flatten_to_2d(col_shape, data_dim + 1);
 
-    framework::DDim input_shape = framework::slice_ddim(
-        input->dims(), 1, static_cast<int>(input->dims().size()));
+    framework::DDim input_shape =
+        framework::slice_ddim(input->dims(), 1, input->dims().size());
 
     framework::DDim filter_matrix_shape = {filter.dims()[0],
                                            filter.numel() / filter.dims()[0]};
@@ -286,13 +293,15 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
     // to call the matrix multiplication interface.
     Tensor col_matrix;
     if (is_expand) {
-      col.mutable_data<T>(col_shape, context.GetPlace());
+      auto tmp_allocation_ptr =
+          platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
+              framework::product(col_shape) * sizeof(T));
+      col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
       col_matrix.ShareDataWith(col);
       col_matrix.Resize(col_matrix_shape);
     }
 
     math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
 
     if (input_grad) {
diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
index e9d2e84a434d7084c526a6e75363a65577197262..72774a878d98b431da05cf870139752421b2df8d 100644
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <limits>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/jit_kernel.h"
+#include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
@@ -82,10 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     Tensor track;
     int* track_value =
         track.mutable_data<int>(emission_dims, platform::CPUPlace());
-    const auto& ker = math::jitkernel::KernelPool::Instance()
-                          .template Get<math::jitkernel::CRFDecodeKernel<T>>(
-                              static_cast<int>(tag_num));
-    ker->Compute(static_cast<int>(seq_len), x, w, alpha_value, track_value);
+    auto ker = jit::Get<jit::kCRFDecoding, jit::CRFDecodingTuples<T>,
+                        platform::CPUPlace>(tag_num);
+    ker(static_cast<int>(seq_len), x, w, alpha_value, track_value, tag_num);
     T max_score = -std::numeric_limits<T>::max();
     int max_i = 0;
     for (size_t i = 0; i < tag_num; ++i) {
diff --git a/paddle/fluid/operators/dequantize_mkldnn_op.cc b/paddle/fluid/operators/dequantize_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..262b7408a7f5f65c4d97120914c16f38ce5fdbe7
--- /dev/null
+++ b/paddle/fluid/operators/dequantize_mkldnn_op.cc
@@ -0,0 +1,88 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/dequantize_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using platform::to_void_cast;
+using Tensor = framework::Tensor;
+using framework::DataLayout;
+using mkldnn::stream;
+using platform::GetMKLDNNFormat;
+
+template <typename T>
+class DeQuantOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("Input");
+    auto scale_data = ctx.Attr<float>("Scale");
+    auto* output = ctx.Output<Tensor>("Output");
+    auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& engine = dev_ctx.GetEngine();
+
+    const T* input_data = input->data<T>();
+    float* output_data = output->mutable_data<float>(ctx.GetPlace());
+    std::vector<float> reorder_scale = {1.0f / scale_data};
+
+    std::vector<primitive> pipeline;
+    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    mkldnn::memory::data_type src_dt =
+        paddle::framework::ToMKLDNNDataType(input->type());
+    mkldnn::memory::format src_fmt = input->format();
+
+    mkldnn::primitive_attr attri;
+    int mask = 0;
+    attri.set_output_scales(mask, reorder_scale);
+
+    auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt);
+    auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
+    auto src_memory =
+        std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
+    std::shared_ptr<primitive::at> src_memory_p =
+        std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
+
+    auto dst_md = platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32,
+                                          memory::format::nchw);
+    auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
+    auto dst_memory = mkldnn::memory(dst_pd, to_void_cast<float>(output_data));
+
+    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
+        new reorder::primitive_desc(src_pd, dst_pd, attri));
+    auto reorder_p = std::shared_ptr<reorder>(
+        new reorder(*reorder_pd, *src_memory_p, dst_memory));
+    pipeline.push_back(*reorder_p);
+    stream(stream::kind::eager).submit(pipeline).wait();
+
+    output->set_format(GetMKLDNNFormat(dst_memory));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(dequantize, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::DeQuantOpKernel<uint8_t>, ops::DeQuantOpKernel<int8_t>);
diff --git a/paddle/fluid/operators/dequantize_op.cc b/paddle/fluid/operators/dequantize_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38159f84a0d56f45cfef233a3c70c3c6cef17d9f
--- /dev/null
+++ b/paddle/fluid/operators/dequantize_op.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/dequantize_op.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+framework::OpKernelType DeQuantOp::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  framework::LibraryType library_ = framework::LibraryType::kMKLDNN;
+  framework::DataLayout layout_ = framework::DataLayout::kMKLDNN;
+
+  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                 ctx.GetPlace(), layout_, library_);
+}
+
+void DeQuantOpMaker::Make() {
+  AddInput("Input", "input data");
+  AddOutput("Output", "output data");
+  AddAttr<float>("Scale", "scale data").SetDefault({1.0f});
+  AddComment(R"DOC(This op will dequantize data from INT8 to FP32)DOC");
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
diff --git a/paddle/fluid/operators/dequantize_op.h b/paddle/fluid/operators/dequantize_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..75c27a06c210f2d0e4d7cf52aa16f4c123f8ad8e
--- /dev/null
+++ b/paddle/fluid/operators/dequantize_op.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::OpKernelType;
+using framework::Tensor;
+
+class DeQuantOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
+    ctx->ShareLoD("Input", /*->*/ "Output");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class DeQuantOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+class DeQuantGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu
index 6a92762896b89a06a91cd11fb38587f7df69e6c3..acd5993154ed03f206f20082231feb5059ef32e1 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.cu
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cu
@@ -142,12 +142,13 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
     vars->mutable_data<T>(ctx.GetPlace());
 
     framework::Tensor d_temp;
-    framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);
+    framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp);
 
     // At least use 32 threads, at most 512 threads.
     // blockx is multiple of 32.
     int blockx = std::min(
-        static_cast<long>(((feature_width * num_priors + 31) >> 5) << 5), 512L);
+        static_cast<int64_t>(((feature_width * num_priors + 31) >> 5) << 5),
+        512L);
     int gridx = (feature_width * num_priors + blockx - 1) / blockx;
     dim3 threads(blockx, 1);
     dim3 grids(gridx, feature_height);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
index c600d1e3d76f7a989dd61e72caf4967aa5923c6f..4c73a70ed1ce2435bfc1a0f3d45afe9b6e3c4cf6 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
@@ -16,11 +16,14 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 
+#include "paddle/fluid/operators/jit/kernels.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 
-#include "paddle/fluid/operators/math/jit_kernel.h"
+#ifdef PADDLE_WITH_XBYAK
 #include "xbyak/xbyak.h"
 #include "xbyak/xbyak_util.h"
+#endif
 
 namespace paddle {
 namespace operators {
@@ -81,8 +84,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
     UpdateDataFormat(ctx, const_cast<Tensor*>(x), "x_data_format");
     UpdateDataFormat(ctx, const_cast<Tensor*>(y), "y_data_format");
 
-    Xbyak::util::Cpu cpu;
-    const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F);
+    const bool is_avx512_enabled = platform::MayIUse(platform::avx512f);
     const bool are_dims_divisable = !(x_int_dims[1] % 16);
     const bool is_x_format_correct = x->format() == memory::format::nChw16c;
     const bool is_y_format_correct = y->format() == memory::format::nc;
@@ -108,10 +110,8 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
         constexpr int simd_width = 16;
         int C = c / simd_width;
 
-        const auto& multiply =
-            math::jitkernel::KernelPool::Instance()
-                .template Get<math::jitkernel::EltwiseMulnChw16cNCKernel<T>>(n);
-
+        auto multiply = jit::Get<jit::kNCHW16CMulNC, jit::NCHW16CMulNCTuples<T>,
+                                 platform::CPUPlace>(0);
 #pragma omp parallel for collapse(2)
         for (int ni = 0; ni < n; ni++) {
           for (int ci = 0; ci < C; ci++) {
@@ -122,7 +122,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
             auto ptr_z =
                 z_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
 
-            multiply->Compute(ptr_x, ptr_y, ptr_z, h, w);
+            multiply(ptr_x, ptr_y, ptr_z, h, w);
           }
         }
       }
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index 4ce67e16dd0c4b15db26bc6556ab4715436c091b..66acba49e5ac25c5097042225ccfe30b258040fa 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/fusion_gru_op.h"
 #include <cstring>  // for memcpy
 #include <string>
+#include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
-#include "paddle/fluid/operators/math/jit_kernel.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
 
 namespace paddle {
@@ -182,27 +182,29 @@ class FusionGRUKernel : public framework::OpKernel<T> {
   const int total_T = x_dims[0];           \
   const int D3 = wh_dims[1]
 
-#define INIT_OTHER_DEFINES                                         \
-  auto* h0 = ctx.Input<Tensor>("H0");                              \
-  auto* wx = ctx.Input<Tensor>("WeightX");                         \
-  auto* bias = ctx.Input<Tensor>("Bias");                          \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");              \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");                  \
-  const int M = x_dims[1];                                         \
-  const int D = wh_dims[0];                                        \
-  const int D2 = D * 2;                                            \
-  const math::jitkernel::gru_attr_t attr(                          \
-      D, ctx.Attr<std::string>("gate_activation"),                 \
-      ctx.Attr<std::string>("activation"));                        \
-  math::jitkernel::gru_t one_step;                                 \
-  const auto& ker =                                                \
-      math::jitkernel::KernelPool::Instance()                      \
-          .template Get<math::jitkernel::GRUKernel<T>,             \
-                        const math::jitkernel::gru_attr_t&>(attr); \
-  const T* x_data = x->data<T>();                                  \
-  const T* wx_data = wx->data<T>();                                \
-  const T* wh_data = wh->data<T>();                                \
-  auto place = ctx.GetPlace();                                     \
+#define INIT_OTHER_DEFINES                                                     \
+  auto* h0 = ctx.Input<Tensor>("H0");                                          \
+  auto* wx = ctx.Input<Tensor>("WeightX");                                     \
+  auto* bias = ctx.Input<Tensor>("Bias");                                      \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");                          \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");                              \
+  const int M = x_dims[1];                                                     \
+  const int D = wh_dims[0];                                                    \
+  const int D2 = D * 2;                                                        \
+  const jit::gru_attr_t attr(                                                  \
+      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),         \
+      jit::to_kerneltype(ctx.Attr<std::string>("activation")));                \
+  jit::gru_t one_step;                                                         \
+  auto ComputeH1 =                                                             \
+      jit::Get<jit::kGRUH1, jit::GRUTuples<T>, platform::CPUPlace>(attr);      \
+  auto ComputeHtPart1 =                                                        \
+      jit::Get<jit::kGRUHtPart1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
+  auto ComputeHtPart2 =                                                        \
+      jit::Get<jit::kGRUHtPart2, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
+  const T* x_data = x->data<T>();                                              \
+  const T* wx_data = wx->data<T>();                                            \
+  const T* wh_data = wh->data<T>();                                            \
+  auto place = ctx.GetPlace();                                                 \
   T* xx_data = xx->mutable_data<T>(place)
 
   void SeqCompute(const framework::ExecutionContext& ctx) const {
@@ -241,7 +243,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       } else {
         one_step.gates = xx_data;
         one_step.ht = hidden_out_data;
-        ker->ComputeH1(&one_step, &attr);
+        ComputeH1(&one_step, &attr);
         prev_hidden_data = hidden_out_data;
         tstart = 1;
         move_step();
@@ -254,12 +256,12 @@ class FusionGRUKernel : public framework::OpKernel<T> {
         one_step.gates = xx_data;
         one_step.ht_1 = prev_hidden_data;
         one_step.ht = hidden_out_data;
-        ker->ComputeHtPart1(&one_step, &attr);
+        ComputeHtPart1(&one_step, &attr);
         // gemm rt * Ws
         blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
                   hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
                   xx_data + D2, D3);
-        ker->ComputeHtPart2(&one_step, &attr);
+        ComputeHtPart2(&one_step, &attr);
         // save prev
         prev_hidden_data = hidden_out_data;
         move_step();
@@ -323,7 +325,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       for (int i = 0; i < max_bs; ++i) {
         one_step.gates = cur_in_data;
         one_step.ht = cur_out_data;
-        ker->ComputeH1(&one_step, &attr);
+        ComputeH1(&one_step, &attr);
         // add offset
         cur_in_data += D3;
         cur_out_data += D;
@@ -351,7 +353,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
         one_step.gates = cur_batched_data;
         one_step.ht_1 = cur_prev_hidden_data;
         one_step.ht = cur_out_data;
-        ker->ComputeHtPart1(&one_step, &attr);
+        ComputeHtPart1(&one_step, &attr);
 
         cur_batched_data += D3;
         cur_prev_hidden_data += D;
@@ -369,7 +371,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
         one_step.gates = cur_batched_data;
         one_step.ht_1 = cur_prev_hidden_data;
         one_step.ht = cur_out_data;
-        ker->ComputeHtPart2(&one_step, &attr);
+        ComputeHtPart2(&one_step, &attr);
         cur_batched_data += D3;
         cur_prev_hidden_data += D;
         cur_out_data += D;
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc
index c4e752e3f0ce7e6d5e1f692fcb9a0290369b4243..b11b7c11bfe0ae4c79d5bb39844bce618649c44d 100644
--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_lstm_op.h"
 #include <string>
+#include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
-#include "paddle/fluid/operators/math/jit_kernel.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
 
 namespace paddle {
@@ -235,31 +235,32 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
   const int D = wh_dims[0];                                 \
   const int D4 = wh_dims[1]
 
-#define INIT_OTHER_DEFINES                                      \
-  const T* x_data = x->data<T>();                               \
-  const T* wx_data = wx->data<T>();                             \
-  const T* wh_data = wh->data<T>();                             \
-  /* diagonal weight*/                                          \
-  const T* wp_data = bias->data<T>() + D4;                      \
-  /* for peephole only*/                                        \
-  T* checked_cell_data = nullptr;                               \
-  auto place = ctx.GetPlace();                                  \
-  if (use_peepholes) {                                          \
-    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/            \
-    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");     \
-    checked_cell_data = checked_cell->mutable_data<T>(place);   \
-  }                                                             \
-  const math::jitkernel::lstm_attr_t attr(                      \
-      D, ctx.Attr<std::string>("gate_activation"),              \
-      ctx.Attr<std::string>("candidate_activation"),            \
-      ctx.Attr<std::string>("cell_activation"), use_peepholes); \
-  math::jitkernel::lstm_t one_step;                             \
-  one_step.wp = wp_data;                                        \
-  one_step.checked = checked_cell_data;                         \
-  const auto& ker =                                             \
-      math::jitkernel::KernelPool::Instance()                   \
-          .template Get<math::jitkernel::LSTMKernel<T>,         \
-                        const math::jitkernel::lstm_attr_t&>(attr)
+#define INIT_OTHER_DEFINES                                                    \
+  const T* x_data = x->data<T>();                                             \
+  const T* wx_data = wx->data<T>();                                           \
+  const T* wh_data = wh->data<T>();                                           \
+  /* diagonal weight*/                                                        \
+  const T* wp_data = bias->data<T>() + D4;                                    \
+  /* for peephole only*/                                                      \
+  T* checked_cell_data = nullptr;                                             \
+  auto place = ctx.GetPlace();                                                \
+  if (use_peepholes) {                                                        \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                          \
+    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");                   \
+    checked_cell_data = checked_cell->mutable_data<T>(place);                 \
+  }                                                                           \
+  const jit::lstm_attr_t attr(                                                \
+      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),        \
+      jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")),      \
+      jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")),           \
+      use_peepholes);                                                         \
+  jit::lstm_t one_step;                                                       \
+  one_step.wp = wp_data;                                                      \
+  one_step.checked = checked_cell_data;                                       \
+  auto ComputeC1H1 =                                                          \
+      jit::Get<jit::kLSTMC1H1, jit::LSTMTuples<T>, platform::CPUPlace>(attr); \
+  auto ComputeCtHt =                                                          \
+      jit::Get<jit::kLSTMCtHt, jit::LSTMTuples<T>, platform::CPUPlace>(attr)
 
 // Wh GEMM
 #define GEMM_WH_ADDON(bs, prev, out)                                           \
@@ -305,7 +306,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
         one_step.gates = xx_data;
         one_step.ct = c_out_data;
         one_step.ht = h_out_data;
-        ker->ComputeC1H1(&one_step, &attr);
+        ComputeC1H1(&one_step, &attr);
         tstart = 1;
         // move one step
         prev_h_data = h_out_data;
@@ -321,7 +322,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
         one_step.ct_1 = prev_c_data;
         one_step.ct = c_out_data;
         one_step.ht = h_out_data;
-        ker->ComputeCtHt(&one_step, &attr);
+        ComputeCtHt(&one_step, &attr);
         // move one step
         prev_h_data = h_out_data;
         prev_c_data = c_out_data;
@@ -401,7 +402,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
         one_step.gates = cur_in_data;
         one_step.ct = cur_c_out_data;
         one_step.ht = cur_h_out_data;
-        ker->ComputeC1H1(&one_step, &attr);
+        ComputeC1H1(&one_step, &attr);
 
         cur_in_data += D4;
         cur_c_out_data += D;
@@ -431,7 +432,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
         one_step.ct_1 = cur_prev_c_data;
         one_step.ct = cur_c_out_data;
         one_step.ht = cur_h_out_data;
-        ker->ComputeCtHt(&one_step, &attr);
+        ComputeCtHt(&one_step, &attr);
 
         // move one batch
         cur_in_data += D4;
diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..262094f9224407bb412f5b189a748efe13cb04b2
--- /dev/null
+++ b/paddle/fluid/operators/jit/CMakeLists.txt
@@ -0,0 +1,25 @@
+
+set(jit_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/jit/kernels.h)
+file(WRITE ${jit_file} "// Generated by the paddle/fluid/operators/jit/CMakeLists.txt.  DO NOT EDIT!\n\n")
+file(APPEND ${jit_file} "\#pragma once\n")
+file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
+file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
+
+set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place)
+
+file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
+list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc)
+cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS})
+
+# refer must go first
+add_subdirectory(refer)
+add_subdirectory(more)
+if(WITH_XBYAK)
+    add_subdirectory(gen)
+endif()
+
+cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS})
+cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper)
+if(NOT WIN32)
+    cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer)
+endif()
diff --git a/paddle/fluid/operators/jit/README.en.md b/paddle/fluid/operators/jit/README.en.md
new file mode 100644
index 0000000000000000000000000000000000000000..8670ec2ff28ac8353217e0ee2f8c9b784e488ac7
--- /dev/null
+++ b/paddle/fluid/operators/jit/README.en.md
@@ -0,0 +1,76 @@
+# JIT Kernel
+
+JIT(Just In Time) Kernel contains actually generated code and some other implemenations with the same logic.
+Each implementations has its own condition to use, defined in `UseMe`.
+They are combined together to get the best performance of one single independent function.
+They could be some very simple functions like vector multiply, or some complicated functions like LSTM.
+And they can be composed with some other exited jit kernels to build up a complex function. 
+Currently it's only supported on CPU yet.
+
+## Contents
+
+```txt
+PaddlePaddle/Paddle/paddle/fluid/
+├── ...
+└── operators/
+    ├── .../
+    └── jit/
+        ├── ...
+        ├── gen/
+        │   └── ...
+        |── more/
+        │   ├── ...
+        │   ├── mkl/
+        │   │   └── ...
+        │   ├── mkldnn/
+        │   │   └── ...
+        │   ├── mix/
+        │   │   └── ...
+        │   ├── intrinsic/
+        │   │   └── ...
+        │   └── openblas/
+        │       └── ...
+        └── refer/
+            └── ...
+```
+
+All basical definations of jit kernels are addressed in `paddle/fluid/operators/jit` including these three key folders `refer`, `gen`, `more`. There is only one unique name for each kernel while may have seraval implementations with same functionality.
+
+- `refer`: Each kernel must have one reference implementation on CPU, and it should only focus on the correctness and should not depends on any third-party libraries.
+- `gen`: The code generated should be kept here. They should be designed focusing on the best performance, which depends on Xbyak.
+- `more`: All other implementations should be kept in this folder with one directory corresponding to one library kind or method kind, such as mkl, mkldnn, openblas or intrinsic code. Each implementation should have it advantage. 
+
+## How to use
+
+One simple function `jit::Get`, which is very easy to use, is supported to get the kernel.
+It can automatically return the expected function with best performance under the given attributes. 
+All kernels are inlcuded in `paddle/fluid/operators/jit/kernels.h`, you can only include this one header to get all the registered kernels.
+
+## Solid Test
+
+- Unit Test
+    All functions should be compared with the corresponding reference functions, including data tyep `float` and `double`.
+- Benchmark
+    All functions should be tested, and make sure the `jit::Get` function obtain the best performance with all attributes.
+
+# How to add new kernel
+
+## Required
+
+1. Add `your_key` at `KernelType`.
+2. Add reference function of `your_key`. 
+Note:
+    - this should be run on CPU and do not depend on any third-party.
+    - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used.
+3. Add unit test in `test.cc`, and verfiy at least `float` and `double`.
+Test more data type for some special functions if necessary, for example `int8`.
+4. Add functions in `benchmark.cc` to test all function of same `KernelType`. Make sure `jit::Get` always get the best one.
+
+## Optional
+
+Add more implementations of `your_kery` for performance enhancement.
+
+1. Add functions based on generated code in `gen`. It should be derived from `JitCode` and should have corepsonding creator from `JitCodeCreator` which will be registered on the `your_key`.
+Note: Add new `KernelTuples` if necessary，your can refer to `XYZNTuples`.
+Specialie method `JitCodeKey` when add new attribute type。
+2. Add more functions in `more`，you can use any third party you wish, like mkl, mkldnn or intrinsic code to reach the best performance.
diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cc19f09f56ddf6a7c74d6605ab3f1bd059f19bb8
--- /dev/null
+++ b/paddle/fluid/operators/jit/README.md
@@ -0,0 +1,66 @@
+# JIT Kernel
+
+结合函数模板和JIT生成需要的kernel函数。
+这里的kernel是比Operator中kernel更小级别的算子单元，更侧重的是在不同硬件上的性能。可以有多重第三方库的实现，每种实现有自己的`UseMe`函数负责什么条件下可以被调用。
+这里实现的函数可以非常细粒度的函数方法，比如Vector MUL， 也可以是一个复杂的逻辑比如LSTM等。复杂的逻辑也可以由自己的底层函数拼接而成。
+目前仅支持CPU上的高性能计算。
+
+## 目录结构
+
+```txt
+PaddlePaddle/Paddle/paddle/fluid/
+├── ...
+└── operators/
+    ├── .../
+    └── jit/
+        ├── ...
+        ├── gen/
+        │   └── ...
+        |── more/
+        │   ├── ...
+        │   ├── mkl/
+        │   │   └── ...
+        │   ├── mkldnn/
+        │   │   └── ...
+        │   ├── mix/
+        │   │   └── ...
+        │   ├── intrinsic/
+        │   │   └── ...
+        │   └── openblas/
+        │       └── ...
+        └── refer/
+            └── ...
+```
+
+基本类的定义都放在根目录下，根目录下包括gen,more和refer三个目录。每个目录下都是一种或者多种实现，每种kernel算子都需要有reference的实现，用作单元测试的基准，其他的实现都是可选的。
+- gen: 代表使用jit生成的code，需要依赖xbyak库。该实现最关心的就是性能。
+- refer: 代表reference的实现，每种kernel算子都需要有在CPU上的reference的实现，他主要关心的算法逻辑的正确性。
+- more: 下面可以放入跟多实现，可以包括mkl，mkldnn，intrinsic，openblas等，也可以是自身已有的kernel组合。
+
+## 动态获取
+
+提供一个`jit::Get`方法，根据kernel类别获取，每种实现都有自己的使用范围，根据范围动态和当前条件选择需要的kernel函数。
+
+## 测试
+
+- 逻辑测试
+    所有实现都要与refer的code对比，需要满足精度要求， 包括float和double的数据类型
+- 性能测试
+    所有实现的性能对比，并且与最终的`jit::Get`方法对比，该方法拿到的性能需要在各种条件下都是最好的。
+
+# 如何添加新的算子
+
+- 在`KernelType` 中添加 `your_key` .
+- 实现Reference 的逻辑，这个是必须是在CPU上的实现，并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER(your_key)`来使用该kernel.
+- (optional) 实现更多的算法在`more`目录下，可以依赖mkl，intrinsic或者mkldnn等第三方库。
+- (optional) 实现基于Xbyak的生成code，在`gen`目下。 jitcode需要实现自己的`JitCodeCreator`，并注册在与refer相同的`KernelType`上。
+- 必要时可以添加新的`KernelTuples`，可以参考`XYZNTuples`，新加的Attr类型需要特例化`JitCodeKey`方法。
+- 在`test.cc`中添加unit test，至少需要测试`float`和`double`两种数据类型，如有必要需要支持额外的数据类型，比如`int8`的相关函数。
+- 在`benchmark.cc`中添加相应的性能对比，同一种kernel需要对比所有实现，并且确保`jit::Get`得到的实现一直是速度最快的。
+
+# 优点
+- 统一的Get方法，接口简单。
+- 同一套逻辑可以有多套实现，可以依赖多套第三方库，互不影响。
+- 目录结构清晰，不会在某个文件中有多个宏定义，导致的可读性差问题。
+- 优化方便，可以直接针对某种属性针对性优化，并不影响其他属性下的性能。
+- 可以支持多种平台，包括Linux，Mac 和 Windows，至少可以保证每种平台都可以正常work。后期也可以针对不同平台有针对的优化。框架层面可以使用统一接口，不必关心底层实现。
diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
new file mode 100644
index 0000000000000000000000000000000000000000..437005825db7e0718b52ac830dd56ac87069ed39
--- /dev/null
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -0,0 +1,231 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include <iostream>
+#include <random>
+#include <string>
+#include <vector>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/kernels.h"
+#include "paddle/fluid/platform/device_tracer.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/port.h"
+
+DEFINE_int32(burning, 10, "Burning times.");
+DEFINE_int32(repeat, 3000, "Repeat times.");
+DEFINE_int32(max_size, 1000, "The Max size would be tested.");
+
+template <typename T>
+void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
+               const T upper = static_cast<T>(20.f), unsigned int seed = 100) {
+  std::mt19937 rng(seed);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+  for (int i = 0; i < n; ++i) {
+    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+  }
+}
+
+std::vector<int> TestSizes() {
+  std::vector<int> s;
+  for (int i = 1; i <= FLAGS_max_size; ++i) {
+    s.push_back(i);
+  }
+  return s;
+}
+
+template <typename KernelTuples, typename... Args>
+struct BenchFunc {
+  // return this function avg time
+  double operator()(const typename KernelTuples::func_type tgt, Args... args) {
+    for (int i = 0; i < FLAGS_burning; ++i) {
+      tgt(args...);
+    }
+    auto start = paddle::platform::PosixInNsec() / 1e-3;
+    for (int i = 0; i < FLAGS_repeat; ++i) {
+      tgt(args...);
+    }
+    auto end = paddle::platform::PosixInNsec() / 1e-3;
+    return static_cast<double>(end - start) / FLAGS_repeat;
+  }
+};
+
+namespace jit = paddle::operators::jit;
+
+template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
+          typename... Args>
+void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
+  BenchFunc<KernelTuples, Args...> benchmark;
+  std::vector<std::pair<std::string, double>> infos;
+  // test refer
+  auto refer = jit::GetRefer<KT, KernelTuples>();
+  if (!refer) {
+    LOG(FATAL) << "Refer can not be empty!";
+  }
+  infos.push_back(std::make_pair("Refer", benchmark(refer, args...)));
+
+  // test jitcode
+  auto jitcode = jit::GetJitCode<KT, KernelTuples, PlaceType>(attr);
+  if (jitcode) {
+    infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...)));
+  }
+  // test all impls in more
+  jit::KernelKey kkey(KT, PlaceType());
+  auto& pool = jit::KernelPool().Instance().AllKernels();
+  auto iter = pool.find(kkey);
+  if (iter != pool.end()) {
+    auto& impls = iter->second;
+    for (auto& impl : impls) {
+      auto i = dynamic_cast<const jit::KernelMore<KernelTuples>*>(impl.get());
+      if (i && i->UseMe(attr)) {
+        auto more = i->GetFunc();
+        infos.push_back(
+            std::make_pair(i->ImplType(), benchmark(more, args...)));
+      }
+    }
+  }
+  // Test result from Get function
+  auto tgt = jit::Get<KT, KernelTuples, PlaceType>(attr);
+  if (!tgt) {
+    LOG(FATAL) << "Target can not be empty!";
+  }
+  infos.push_back(std::make_pair("Target", benchmark(tgt, args...)));
+
+  // print
+  std::ostringstream loginfos;
+  loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": ";
+  for (auto pair : infos) {
+    loginfos << pair.first << " takes " << pair.second << " us; ";
+  }
+  LOG(INFO) << loginfos.str();
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void BenchXYZNKernel() {
+  for (int d : TestSizes()) {
+    std::vector<T> x(d), y(d), z(d);
+    RandomVec<T>(d, x.data());
+    RandomVec<T>(d, y.data());
+    BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data(), y.data(),
+                                                     z.data(), d);
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void BenchAXYNKernel() {
+  for (int d : TestSizes()) {
+    const T a = static_cast<T>(3);
+    std::vector<T> x(d), y(d);
+    RandomVec<T>(d, x.data());
+    BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data(), y.data(),
+                                                     d);
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void BenchXYNKernel() {
+  for (int d : TestSizes()) {
+    std::vector<T> x(d), y(d);
+    RandomVec<T>(d, x.data());
+    BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data(), y.data(), d);
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void BenchLSTMKernel() {
+  for (bool use_peephole : {true, false}) {
+    for (int d : TestSizes()) {
+      const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh,
+                                  use_peephole);
+      std::vector<T> x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d);
+      RandomVec<T>(4 * d, x.data(), -2.f, 2.f);
+      RandomVec<T>(3 * d, wp.data(), -2.f, 2.f);
+      RandomVec<T>(d, ct_1.data(), -2.f, 2.f);
+      const T* ct_1_data = ct_1.data();
+      const T* wp_data = wp.data();
+      T* x_data = x.data();
+      T* checked_data = checked.data();
+      T* ct_data = ct.data();
+      T* ht_data = ht.data();
+      jit::lstm_t step;
+      step.gates = x_data;
+      step.ct_1 = ct_1_data;
+      step.ct = ct_data;
+      step.ht = ht_data;
+      if (use_peephole) {
+        step.wp = wp_data;
+        step.checked = checked_data;
+      }
+      BenchAllImpls<KT, jit::LSTMTuples<T>, PlaceType>(attr, &step, &attr);
+    }
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void BenchGRUKernel() {
+  for (int d : TestSizes()) {
+    const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
+    std::vector<T> x(3 * d), ht_1(d), ht(d);
+    RandomVec<T>(3 * d, x.data(), -2.f, 2.f);
+    RandomVec<T>(d, ht_1.data(), -2.f, 2.f);
+    const T* ht_1_data = ht_1.data();
+    T* x_data = x.data();
+    T* ht_data = ht.data();
+    jit::gru_t step;
+    step.gates = x_data;
+    step.ht_1 = ht_1_data;
+    step.ht = ht_data;
+    BenchAllImpls<KT, jit::GRUTuples<T>, PlaceType>(attr, &step, &attr);
+  }
+}
+
+// Benchmark all jit kernels including jitcode, mkl and refer.
+// To use this tool, run command: ./benchmark [options...]
+// Options:
+//     --burning: the burning time before count
+//     --repeat: the repeat times
+//     --max_size: the max size would be tested
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  google::InitGoogleLogging(argv[0]);
+  LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat
+            << " times.";
+  using T = float;
+  using PlaceType = paddle::platform::CPUPlace;
+  // xyzn
+  BenchXYZNKernel<jit::kVMul, T, PlaceType>();
+  BenchXYZNKernel<jit::kVAdd, T, PlaceType>();
+  BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>();
+  BenchXYZNKernel<jit::kVSub, T, PlaceType>();
+
+  // axyn
+  BenchAXYNKernel<jit::kVScal, T, PlaceType>();
+  BenchAXYNKernel<jit::kVAddBias, T, PlaceType>();
+
+  // xyn
+  BenchXYNKernel<jit::kVRelu, T, PlaceType>();
+  BenchXYNKernel<jit::kVIdentity, T, PlaceType>();
+  BenchXYNKernel<jit::kVExp, T, PlaceType>();
+  BenchXYNKernel<jit::kVSigmoid, T, PlaceType>();
+  BenchXYNKernel<jit::kVTanh, T, PlaceType>();
+
+  // lstm and peephole
+  BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>();
+  BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>();
+
+  // gru functions
+  BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
+  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
+  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
+}
diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8a540108302f77e1ca3bfe1db0013d76a22d5eb4
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -0,0 +1,28 @@
+
+file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
+
+cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak)
+set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE)
+
+function(USE_JITKERNEL_GEN TARGET)
+    file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n")
+endfunction()
+
+# use gen jitcode kernel by name
+USE_JITKERNEL_GEN(kVMul)
+USE_JITKERNEL_GEN(kVAdd)
+#USE_JITKERNEL_GEN(kVSub) # TODO(TJ): enable me
+USE_JITKERNEL_GEN(kVAddRelu)
+USE_JITKERNEL_GEN(kVScal)
+USE_JITKERNEL_GEN(kVAddBias)
+USE_JITKERNEL_GEN(kVRelu)
+USE_JITKERNEL_GEN(kVIdentity)
+USE_JITKERNEL_GEN(kVExp)
+USE_JITKERNEL_GEN(kVSigmoid)
+USE_JITKERNEL_GEN(kVTanh)
+USE_JITKERNEL_GEN(kLSTMCtHt)
+USE_JITKERNEL_GEN(kLSTMC1H1)
+USE_JITKERNEL_GEN(kGRUH1)
+USE_JITKERNEL_GEN(kGRUHtPart1)
+USE_JITKERNEL_GEN(kGRUHtPart2)
+USE_JITKERNEL_GEN(kNCHW16CMulNC)
diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ea076f217dc7c8a755055d3f48c22b7a3627012
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/act.cc
@@ -0,0 +1,135 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/act.h"
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = {
+    REPEAT_8TIMES(1.f),
+    REPEAT_8TIMES(2.f),
+    REPEAT_8TIMES(0.5f),
+    REPEAT_8TIMES(EXP_HIG),
+    REPEAT_8TIMES(EXP_LOW),
+    REPEAT_8TIMES(CEPHES_LOG2EF),
+    REPEAT_8TIMES(CEPHES_EXP_C1),
+    REPEAT_8TIMES(CEPHES_EXP_C2),
+    REPEAT_8TIMES(CEPHES_EXP_P0),
+    REPEAT_8TIMES(CEPHES_EXP_P1),
+    REPEAT_8TIMES(CEPHES_EXP_P2),
+    REPEAT_8TIMES(CEPHES_EXP_P3),
+    REPEAT_8TIMES(CEPHES_EXP_P4),
+    REPEAT_8TIMES(CEPHES_EXP_P5),
+    REPEAT_8TIMES(EXP_MAX_INPUT),
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
+
+const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)};
+int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0};
+
+void VActJitCode::genCode() {
+  int offset = 0;
+  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
+    vmovups(ymm_src, ptr[param1 + offset]);
+    act<ymm_t>(ymm_dst, ymm_src, type_);
+    vmovups(ptr[param2 + offset], ymm_dst);
+    offset += sizeof(float) * YMM_FLOAT_BLOCK;
+  }
+  int rest = num_ % YMM_FLOAT_BLOCK;
+  while (rest > 0) {
+    int block = XMM_FLOAT_BLOCK;
+    if (rest >= 4) {
+      block = 4;
+      vmovups(xmm_src, ptr[param1 + offset]);
+    } else if (rest >= 2) {
+      block = 2;
+      vmovq(xmm_src, ptr[param1 + offset]);
+    } else {
+      block = 1;
+      vmovss(xmm_src, ptr[param1 + offset]);
+    }
+    act<xmm_t>(xmm_dst, xmm_src, type_);
+    if (rest >= 4) {
+      vmovups(ptr[param2 + offset], xmm_dst);
+    } else if (rest >= 2) {
+      vmovq(ptr[param2 + offset], xmm_dst);
+    } else {
+      vmovss(ptr[param2 + offset], xmm_dst);
+    }
+    offset += sizeof(float) * block;
+    rest -= block;
+  }
+  ret();
+}
+
+#define DECLARE_ACT_CREATOR(name)                                            \
+  class name##Creator : public JitCodeCreator<int> {                         \
+   public:                                                                   \
+    bool UseMe(const int& attr) const override {                             \
+      return platform::MayIUse(platform::avx);                               \
+    }                                                                        \
+    size_t CodeSize(const int& d) const override;                            \
+    std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
+      return make_unique<name##JitCode>(attr, CodeSize(attr));               \
+    }                                                                        \
+  }
+
+DECLARE_ACT_CREATOR(VRelu);
+DECLARE_ACT_CREATOR(VIdentity);
+DECLARE_ACT_CREATOR(VExp);
+DECLARE_ACT_CREATOR(VSigmoid);
+DECLARE_ACT_CREATOR(VTanh);
+
+// TODO(TJ): tuning use me
+size_t VReluCreator::CodeSize(const int& d) const {
+  return 96 /* init size */ +
+         (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ *
+             8 /* average bytes for each instruction */;
+}
+
+size_t VIdentityCreator::CodeSize(const int& d) const {
+  return 96 + (d / YMM_FLOAT_BLOCK + 3) * 4 * 8;
+}
+
+size_t VExpCreator::CodeSize(const int& d) const {
+  return 96 + (d / YMM_FLOAT_BLOCK + 3) * 70 * 8;
+}
+
+size_t VSigmoidCreator::CodeSize(const int& d) const {
+  return 96 + (d / YMM_FLOAT_BLOCK + 3) * 82 * 8;
+}
+
+size_t VTanhCreator::CodeSize(const int& d) const {
+  return 96 + (d / YMM_FLOAT_BLOCK + 3) * 84 * 8;
+}
+
+#undef DECLARE_ACT_CREATOR
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator);
+REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator);
+REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator);
+REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator);
+REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator);
diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/jit/gen/act.h
similarity index 52%
rename from paddle/fluid/operators/math/jit_code.h
rename to paddle/fluid/operators/jit/gen/act.h
index 6d22bf675724166d0701e9a51d0d23ae00ef1048..81503c42ab5cd46961378847584f68f2cbed0ed5 100644
--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/jit/gen/act.h
@@ -1,48 +1,28 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
 
 #pragma once
 
 #include <string>
-#include "paddle/fluid/operators/math/jit_gen.h"
-#include "paddle/fluid/operators/math/jit_kernel_impl.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
 
 namespace paddle {
 namespace operators {
-namespace math {
-namespace jitkernel {
+namespace jit {
 namespace gen {
 
-using reg64_t = const Xbyak::Reg64;
-using reg32_t = const Xbyak::Reg32;
-using xmm_t = const Xbyak::Xmm;
-using ymm_t = const Xbyak::Ymm;
-using zmm_t = const Xbyak::Zmm;
-using Label = Xbyak::Label;
-
-typedef enum {
-  mul = 0,
-  add,
-  sub,
-  relu,
-  exp,
-  sigmoid,
-  tanh,
-  identity
-} operand_type;
-
 extern const float exp_float_consts[];
 extern const int exp_int_0x7f[];
 extern int g_tmp_mem[];
@@ -79,94 +59,15 @@ extern int g_tmp_mem[];
 #define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
 #define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
 
-// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
-class VXXJitCode : public JitCode {
- public:
-  const char* name() const override {
-    std::string base = "VXXJitCode";
-    if (scalar_index_ == 1) {
-      base += "_Scalar";
-    } else {
-      base += "_Vec";
-    }
-    if (type_ == operand_type::mul) {
-      base += "_Mul";
-    } else if (type_ == operand_type::add) {
-      base += "_Add";
-    }
-    if (scalar_index_ == 2) {
-      base += "_Scalar";
-    } else {
-      base += "_Vec";
-    }
-    base += (with_relu_ ? "_Relu" : "");
-    return base.c_str();
-  }
-  explicit VXXJitCode(int d, operand_type type, int scalar_index,
-                      bool with_relu, size_t code_size = 256 * 1024,
-                      void* code_ptr = nullptr)
-      : JitCode(code_size, code_ptr),
-        num_(d),
-        type_(type),
-        scalar_index_(scalar_index),
-        with_relu_(with_relu) {}
-  static bool init(int d, int scalar_index = 0);
-  void generate() override;
-
- private:
-  int num_;
-  operand_type type_;
-  int scalar_index_;
-  bool with_relu_;
-  reg64_t param1{abi_param1};
-  reg64_t param2{abi_param2};
-  reg64_t param3{abi_param3};
-
-  xmm_t xmm_src1 = xmm_t(0);
-  xmm_t xmm_src2 = xmm_t(1);
-  xmm_t xmm_dst = xmm_t(2);
-  xmm_t xmm_zero = xmm_t(3);
-
-  ymm_t ymm_src1 = ymm_t(0);
-  ymm_t ymm_src2 = ymm_t(1);
-  ymm_t ymm_dst = ymm_t(2);
-  ymm_t ymm_zero = ymm_t(3);
-};
-
-class VActJitCode : public JitCode {
+class VActFunc : public JitCode {
  public:
-  const char* name() const override {
-    std::string base = "VActJitCode";
-    switch (type_) {
-      case operand_type::relu:
-        base += "_Relu";
-        break;
-      case operand_type::exp:
-        base += "_Exp";
-        break;
-      case operand_type::sigmoid:
-        base += "_Sigmoid";
-        break;
-      case operand_type::tanh:
-        base += "_Tanh";
-        break;
-      case operand_type::identity:
-        base += "_Identity";
-        break;
-      default:
-        break;
-    }
-    return base.c_str();
-  }
-
-  explicit VActJitCode(int d, operand_type type, size_t code_size = 256 * 1024,
-                       void* code_ptr = nullptr)
-      : JitCode(code_size, code_ptr), num_(d), type_(type) {}
-  static bool init(int d, operand_type type);
-  void generate() override;
+  explicit VActFunc(size_t code_size, void* code_ptr)
+      : JitCode(code_size, code_ptr) {}
+  virtual const char* name() const = 0;
+  virtual void genCode() = 0;
 
  protected:
-  // compute relu with ymm, xmm
+  // compute RELU with ymm, xmm
   template <typename JMM>
   void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) {  // NOLINT
     JMM zero = JMM(zero_idx);
@@ -174,7 +75,7 @@ class VActJitCode : public JitCode {
     vmaxps(dst, src, zero);
   }
 
-  // compute exp with ymm, xmm
+  // compute EXP with ymm, xmm
   template <typename JMM>
   void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12,  // NOLINT
                int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) {
@@ -258,7 +159,7 @@ class VActJitCode : public JitCode {
     pop(reg_ptr_global);
   }
 
-  // compute sigmoid with ymm, xmm
+  // compute SIGMOID with ymm, xmm
   template <typename JMM>
   void sigmoid_jmm(JMM& dst, JMM& src, int src_idx = 11,  // NOLINT
                    int fx_idx = 12, int fy_idx = 13, int mask_idx = 14,
@@ -283,7 +184,7 @@ class VActJitCode : public JitCode {
     pop(reg_ptr_global);
   }
 
-  // compute tanh with ymm, xmm
+  // compute TANH with ymm, xmm
   template <typename JMM>
   void tanh_jmm(JMM& dst, JMM& src, int src_idx = 11,  // NOLINT
                 int fx_idx = 12, int fy_idx = 13, int mask_idx = 14,
@@ -310,223 +211,109 @@ class VActJitCode : public JitCode {
     pop(reg_ptr_global);
   }
 
+  // compute IDENTITY with ymm, xmm
+  template <typename JMM>
+  void identity_jmm(JMM& dst, JMM& src, int zero_idx) {  // NOLINT
+    JMM zero = JMM(zero_idx);
+    vxorps(zero, zero, zero);
+    vaddps(dst, src, zero);
+    // TODO(TJ): use below
+    // dst.setIdx(src.getIdx());
+  }
+
   template <typename JMM>
   void act(JMM& dst, JMM& src, operand_type type) {  // NOLINT
     // use 11~15
     switch (type) {
-      case operand_type::relu:
+      case operand_type::RELU:
         relu_jmm<JMM>(dst, src, 15);
         break;
-      case operand_type::exp:
+      case operand_type::EXP:
         exp_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
         break;
-      case operand_type::sigmoid:
+      case operand_type::SIGMOID:
         sigmoid_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
         break;
-      case operand_type::tanh:
+      case operand_type::TANH:
         tanh_jmm<JMM>(dst, src, 11, 12, 13, 14, 15);
         break;
-      case operand_type::identity:
+      case operand_type::IDENTITY:
+        identity_jmm<JMM>(dst, src, 15);
         break;
       default:
-        // throw error
+        LOG(FATAL) << "Do not support this operand type: " << type;
         break;
     }
   }
-
- protected:
-  int num_;
-  operand_type type_;
-  reg64_t param1{abi_param1};
-  reg64_t param2{abi_param2};
-
-  xmm_t xmm_src = xmm_t(0);
-  ymm_t ymm_src = ymm_t(0);
-
-  xmm_t xmm_dst = xmm_t(1);
-  ymm_t ymm_dst = ymm_t(1);
 };
 
-class LSTMJitCode : public VActJitCode {
+class VActJitCode : public VActFunc {
  public:
-  const char* name() const override {
-    std::string base = "LSTMJitCode";
-    if (use_peephole_) {
-      base += "_Peephole";
-    }
-    if (compute_c1h1_) {
-      base += "_C1H1";
+  explicit VActJitCode(int d, operand_type type, size_t code_size,
+                       void* code_ptr = nullptr)
+      : VActFunc(code_size, code_ptr), num_(d), type_(type) {
+    if (!(type_ == operand_type::RELU || type_ == operand_type::EXP ||
+          type_ == operand_type::SIGMOID || type_ == operand_type::TANH ||
+          type_ == operand_type::IDENTITY)) {
+      LOG(FATAL) << "Do not support this operand type: " << type_;
     }
-    auto AddTypeStr = [&](operand_type type) {
-      switch (type) {
-        case operand_type::relu:
-          base += "_Relu";
-          break;
-        case operand_type::exp:
-          base += "_Exp";
-          break;
-        case operand_type::sigmoid:
-          base += "_Sigmoid";
-          break;
-        case operand_type::tanh:
-          base += "_Tanh";
-          break;
-        case operand_type::identity:
-          base += "_Identity";
-          break;
-        default:
-          break;
-      }
-    };
-    AddTypeStr(act_gate_);
-    AddTypeStr(act_cand_);
-    AddTypeStr(act_cell_);
-    return base.c_str();
-  }
-
-  explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr,
-                       size_t code_size = 256 * 1024, void* code_ptr = nullptr)
-      : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size,
-                    code_ptr),
-        compute_c1h1_(compute_c1h1) {
-    auto typeExchange = [](const std::string& type) -> gen::operand_type {
-      if (type == "sigmoid") {
-        return operand_type::sigmoid;
-      } else if (type == "relu") {
-        return operand_type::relu;
-      } else if (type == "tanh") {
-        return operand_type::tanh;
-      } else if (type == "identity" || type == "") {
-        return operand_type::identity;
-      }  // else throw error
-      return operand_type::identity;
-    };
-    num_ = attr.d;
-    use_peephole_ = attr.use_peephole;
-    act_gate_ = typeExchange(attr.act_gate);
-    act_cand_ = typeExchange(attr.act_cand);
-    act_cell_ = typeExchange(attr.act_cell);
+    this->genCode();
   }
-  static bool init(int d);
-  void generate() override;
-
- protected:
-  int num_;
-  bool compute_c1h1_;
-  bool use_peephole_;
-  operand_type act_gate_;
-  operand_type act_cand_;
-  operand_type act_cell_;
-  reg64_t param1{abi_param1};
-};
 
-class GRUJitCode : public VActJitCode {
- public:
   const char* name() const override {
-    std::string base = "GRUJitCode";
-    if (id_ == 0) {
-      base += "_H1";
-    } else if (id_ == 1) {
-      base += "_HtPart1";
-    } else if (id_ == 2) {
-      base += "_HtPart2";
+    std::string base = "VActJitCode";
+    switch (type_) {
+      case operand_type::RELU:
+        base += "_Relu";
+        break;
+      case operand_type::EXP:
+        base += "_Exp";
+        break;
+      case operand_type::SIGMOID:
+        base += "_Sigmoid";
+        break;
+      case operand_type::TANH:
+        base += "_Tanh";
+        break;
+      case operand_type::IDENTITY:
+        base += "_Identity";
+        break;
+      default:
+        break;
     }
-    auto AddTypeStr = [&](operand_type type) {
-      switch (type) {
-        case operand_type::relu:
-          base += "_Relu";
-          break;
-        case operand_type::exp:
-          base += "_Exp";
-          break;
-        case operand_type::sigmoid:
-          base += "_Sigmoid";
-          break;
-        case operand_type::tanh:
-          base += "_Tanh";
-          break;
-        case operand_type::identity:
-          base += "_Identity";
-          break;
-        default:
-          break;
-      }
-    };
-    AddTypeStr(act_gate_);
-    AddTypeStr(act_cand_);
     return base.c_str();
   }
-
-  explicit GRUJitCode(int id, const gru_attr_t& attr,
-                      size_t code_size = 256 * 1024, void* code_ptr = nullptr)
-      : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size,
-                    code_ptr),
-        id_(id) {
-    auto typeExchange = [](const std::string& type) -> gen::operand_type {
-      if (type == "sigmoid") {
-        return operand_type::sigmoid;
-      } else if (type == "relu") {
-        return operand_type::relu;
-      } else if (type == "tanh") {
-        return operand_type::tanh;
-      } else if (type == "identity" || type == "") {
-        return operand_type::identity;
-      }  // else throw error
-      return operand_type::identity;
-    };
-    num_ = attr.d;
-    act_gate_ = typeExchange(attr.act_gate);
-    act_cand_ = typeExchange(attr.act_cand);
-  }
-  static bool init(int d);
-  void generate() override;
+  void genCode() override;
 
  protected:
-  int id_;
   int num_;
-  operand_type act_gate_;
-  operand_type act_cand_;
+  operand_type type_;
   reg64_t param1{abi_param1};
-};
+  reg64_t param2{abi_param2};
 
-#ifdef PADDLE_WITH_MKLDNN
-struct EltwiseMulnChw16cNC : public Xbyak::CodeGenerator {
-  explicit EltwiseMulnChw16cNC(size_t code_size = 256 * 1024)
-      : Xbyak::CodeGenerator(code_size) {
-    // RDI is ptr x_input
-    // RSI is ptr y_input
-    // RDX is ptr output
-    // RCX is height
-    // r8 is width
+  xmm_t xmm_src = xmm_t(0);
+  ymm_t ymm_src = ymm_t(0);
 
-    push(rbx);
+  xmm_t xmm_dst = xmm_t(1);
+  ymm_t ymm_dst = ymm_t(1);
+};
 
-    xor_(rax, rax);
-    xor_(r10, r10);
-    vmovups(zmm3, ptr[rsi]);
+#define DECLARE_ACT_JITCODE(name, op_type)                                    \
+  class name##JitCode : public VActJitCode {                                  \
+   public:                                                                    \
+    explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \
+        : VActJitCode(d, op_type, code_size, code_ptr) {}                     \
+  };
 
-    L("h_loop");
-    xor_(rbx, rbx);
-    L("w_loop");
-    vmovups(zmm2, ptr[rdi + rax]);
-    vmulps(zmm1, zmm2, zmm3);
-    vmovups(ptr[rdx + rax], zmm1);
-    add(rax, 64);
-    inc(rbx);
-    cmp(r8, rbx);
-    jnz("w_loop");
-    inc(r10);
-    cmp(r10, rcx);
-    jnz("h_loop");
+DECLARE_ACT_JITCODE(VRelu, operand_type::RELU);
+DECLARE_ACT_JITCODE(VIdentity, operand_type::IDENTITY);
+DECLARE_ACT_JITCODE(VExp, operand_type::EXP);
+DECLARE_ACT_JITCODE(VSigmoid, operand_type::SIGMOID);
+DECLARE_ACT_JITCODE(VTanh, operand_type::TANH);
 
-    pop(rbx);
-    ret();
-  }
-};
-#endif
+#undef DECLARE_ACT_JITCODE
 
 }  // namespace gen
-}  // namespace jitkernel
-}  // namespace math
+}  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1198773088faa594bac0714dd8449b240b3ce4d
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/blas.cc
@@ -0,0 +1,186 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/blas.h"
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+void VXXJitCode::genCode() {
+  // do not need push stack, and do not need save avx512reg if do not use avx512
+  int offset = 0;
+  if (with_relu_) {
+    vxorps(ymm_zero, ymm_zero, ymm_zero);
+  }
+  if (scalar_index_ == 1) {
+    vbroadcastss(ymm_src1, ptr[param1]);
+  } else if (scalar_index_ == 2) {
+    vbroadcastss(ymm_src2, ptr[param2]);
+  }
+  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
+    if (scalar_index_ != 1) {
+      vmovups(ymm_src1, ptr[param1 + offset]);
+    }
+    if (scalar_index_ != 2) {
+      vmovups(ymm_src2, ptr[param2 + offset]);
+    }
+    if (type_ == operand_type::MUL) {
+      vmulps(ymm_dst, ymm_src1, ymm_src2);
+    } else if (type_ == operand_type::ADD) {
+      vaddps(ymm_dst, ymm_src1, ymm_src2);
+    }
+    if (with_relu_) {
+      vmaxps(ymm_dst, ymm_zero, ymm_dst);
+    }
+    vmovups(ptr[param3 + offset], ymm_dst);
+    offset += sizeof(float) * YMM_FLOAT_BLOCK;
+  }
+  int rest = num_ % YMM_FLOAT_BLOCK;
+  while (rest > 0) {
+    int block = XMM_FLOAT_BLOCK;
+    if (rest >= 4) {
+      block = 4;
+      if (scalar_index_ != 1) {
+        vmovups(xmm_src1, ptr[param1 + offset]);
+      }
+      if (scalar_index_ != 2) {
+        vmovups(xmm_src2, ptr[param2 + offset]);
+      }
+    } else if (rest >= 2) {
+      block = 2;
+      if (scalar_index_ != 1) {
+        vmovq(xmm_src1, ptr[param1 + offset]);
+      }
+      if (scalar_index_ != 2) {
+        vmovq(xmm_src2, ptr[param2 + offset]);
+      }
+    } else {
+      block = 1;
+      if (scalar_index_ != 1) {
+        vmovss(xmm_src1, ptr[param1 + offset]);
+      }
+      if (scalar_index_ != 2) {
+        vmovss(xmm_src2, ptr[param2 + offset]);
+      }
+    }
+    switch (type_) {
+      case operand_type::MUL:
+        vmulps(xmm_dst, xmm_src1, xmm_src2);
+        break;
+      case operand_type::ADD:
+        vaddps(xmm_dst, xmm_src1, xmm_src2);
+        break;
+      default:
+        break;
+    }
+    if (with_relu_) {
+      vmaxps(xmm_dst, xmm_zero, xmm_dst);
+    }
+    if (rest >= 4) {
+      vmovups(ptr[param3 + offset], xmm_dst);
+    } else if (rest >= 2) {
+      vmovq(ptr[param3 + offset], xmm_dst);
+    } else {
+      vmovss(ptr[param3 + offset], xmm_dst);
+    }
+    offset += sizeof(float) * block;
+    rest -= block;
+  }
+  ret();
+}
+
+void NCHW16CMulNCJitCode::genCode() {
+  // RDI is ptr x_input
+  // RSI is ptr y_input
+  // RDX is ptr output
+  // RCX is height
+  // r8 is width
+
+  push(rbx);
+
+  xor_(rax, rax);
+  xor_(r10, r10);
+  vmovups(zmm3, ptr[rsi]);
+
+  L("h_loop");
+  xor_(rbx, rbx);
+  L("w_loop");
+  vmovups(zmm2, ptr[rdi + rax]);
+  vmulps(zmm1, zmm2, zmm3);
+  vmovups(ptr[rdx + rax], zmm1);
+  add(rax, 64);
+  inc(rbx);
+  cmp(r8, rbx);
+  jnz("w_loop");
+  inc(r10);
+  cmp(r10, rcx);
+  jnz("h_loop");
+
+  pop(rbx);
+  ret();
+}
+
+class NCHW16CMulNCCreator : public JitCodeCreator<int> {
+ public:
+  bool UseMe(const int& attr) const override {
+    return platform::MayIUse(platform::avx512f);
+  }
+  size_t CodeSize(const int& d) const override { return 256 * 1024; }
+  std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
+    return make_unique<NCHW16CMulNCJitCode>(attr, CodeSize(attr));
+  }
+};
+
+#define DECLARE_BLAS_CREATOR(name)                                           \
+  class name##Creator : public JitCodeCreator<int> {                         \
+   public:                                                                   \
+    bool UseMe(const int& attr) const override {                             \
+      return platform::MayIUse(platform::avx);                               \
+    }                                                                        \
+    size_t CodeSize(const int& d) const override {                           \
+      return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;                               \
+    }                                                                        \
+    std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
+      return make_unique<name##JitCode>(attr, CodeSize(attr));               \
+    }                                                                        \
+  }
+
+DECLARE_BLAS_CREATOR(VMul);
+DECLARE_BLAS_CREATOR(VAdd);
+DECLARE_BLAS_CREATOR(VSub);
+DECLARE_BLAS_CREATOR(VAddRelu);
+DECLARE_BLAS_CREATOR(VScal);
+DECLARE_BLAS_CREATOR(VAddBias);
+
+#undef DECLARE_BLAS_CREATOR
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kVMul, gen::VMulCreator);
+REGISTER_JITKERNEL_GEN(kVAdd, gen::VAddCreator);
+// TODO(TJ): enable sub
+// REGISTER_JITKERNEL_GEN(kVSub, gen::VSubCreator);
+REGISTER_JITKERNEL_GEN(kVAddRelu, gen::VAddReluCreator);
+REGISTER_JITKERNEL_GEN(kVScal, gen::VScalCreator);
+REGISTER_JITKERNEL_GEN(kVAddBias, gen::VAddBiasCreator);
+REGISTER_JITKERNEL_GEN(kNCHW16CMulNC, gen::NCHW16CMulNCCreator);
diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h
new file mode 100644
index 0000000000000000000000000000000000000000..c46ec15fb788c0c7a90cfc8732aad375a9e226a1
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/blas.h
@@ -0,0 +1,117 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
+class VXXJitCode : public JitCode {
+ public:
+  explicit VXXJitCode(int d, operand_type type, int scalar_index,
+                      bool with_relu, size_t code_size = 256 * 1024,
+                      void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr),
+        num_(d),
+        type_(type),
+        scalar_index_(scalar_index),
+        with_relu_(with_relu) {
+    if (!(type_ == operand_type::MUL || type_ == operand_type::ADD)) {
+      LOG(FATAL) << "Do not support this operand type: " << type_;
+    }
+    this->genCode();
+  }
+
+  virtual const char* name() const {
+    std::string base = "VXXJitCode";
+    if (scalar_index_ == 1) {
+      base += "_Scalar";
+    } else {
+      base += "_Vec";
+    }
+    if (type_ == operand_type::MUL) {
+      base += "_Mul";
+    } else if (type_ == operand_type::ADD) {
+      base += "_Add";
+    }
+    if (scalar_index_ == 2) {
+      base += "_Scalar";
+    } else {
+      base += "_Vec";
+    }
+    base += (with_relu_ ? "_Relu" : "");
+    return base.c_str();
+  }
+  void genCode() override;
+
+ private:
+  int num_;
+  operand_type type_;
+  int scalar_index_;
+  bool with_relu_;
+  reg64_t param1{abi_param1};
+  reg64_t param2{abi_param2};
+  reg64_t param3{abi_param3};
+
+  xmm_t xmm_src1 = xmm_t(0);
+  xmm_t xmm_src2 = xmm_t(1);
+  xmm_t xmm_dst = xmm_t(2);
+  xmm_t xmm_zero = xmm_t(3);
+
+  ymm_t ymm_src1 = ymm_t(0);
+  ymm_t ymm_src2 = ymm_t(1);
+  ymm_t ymm_dst = ymm_t(2);
+  ymm_t ymm_zero = ymm_t(3);
+};
+
+#define DECLARE_BLAS_JITCODE(name, op_type, scalar_idx, with_relu)             \
+  class name##JitCode : public VXXJitCode {                                    \
+   public:                                                                     \
+    explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr)  \
+        : VXXJitCode(d, op_type, scalar_idx, with_relu, code_size, code_ptr) { \
+    }                                                                          \
+  };
+
+DECLARE_BLAS_JITCODE(VMul, operand_type::MUL, 0, false);
+DECLARE_BLAS_JITCODE(VAdd, operand_type::ADD, 0, false);
+DECLARE_BLAS_JITCODE(VSub, operand_type::SUB, 0, false);
+DECLARE_BLAS_JITCODE(VAddRelu, operand_type::ADD, 0, true);
+DECLARE_BLAS_JITCODE(VScal, operand_type::MUL, 1, false);
+DECLARE_BLAS_JITCODE(VAddBias, operand_type::ADD, 1, false);
+
+#undef DECLARE_BLAS_JITCODE
+
+// nChw16c = nChw16c .* NC
+class NCHW16CMulNCJitCode : public JitCode {
+ public:
+  DECLARE_JIT_CODE(NCHW16CMulNCJitCode);
+  explicit NCHW16CMulNCJitCode(int d /*unused*/, size_t code_size,
+                               void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr) {
+    this->genCode();
+  }
+  void genCode() override;
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc
new file mode 100644
index 0000000000000000000000000000000000000000..13f7a14111a80632a06c7fc632da47c0802828f7
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/gru.cc
@@ -0,0 +1,116 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/gru.h"
+#include <stddef.h>  // offsetof
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+void GRUJitCode::genCode() {
+  reg64_t reg_ptr_gates = rax;
+  reg64_t reg_ptr_ht_1 = r9;
+  reg64_t reg_ptr_ht = r10;
+  mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]);
+  mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]);
+  mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]);
+  ymm_t ymm_one = ymm_t(0);
+
+  if (id_ == 2) {
+    reg64_t reg_ptr_tmp = r11;
+    mov(reg_ptr_tmp, reinterpret_cast<size_t>(exp_float_consts));
+    vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]);
+  }
+  int offset = 0;
+  int d = num_ * sizeof(float);
+  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
+    ymm_t ymm_u = ymm_t(1);
+    ymm_t ymm_r = ymm_t(2);
+    ymm_t ymm_s = ymm_t(3);
+    ymm_t ymm_ht_1 = ymm_t(4);
+    // W: {W_update, W_reset; W_state}
+    if (id_ == 0 || id_ == 2) {
+      vmovups(ymm_u, ptr[reg_ptr_gates + offset]);
+      vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]);
+    }
+    if (id_ == 1) {
+      vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]);
+    }
+    if (id_ == 1 || id_ == 2) {
+      vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]);
+    }
+
+    if (id_ == 0) {
+      // ht = act_gate(u) * act_cand(s)
+      act<ymm_t>(ymm_u, ymm_u, act_gate_);
+      act<ymm_t>(ymm_s, ymm_s, act_cand_);
+      vmulps(ymm_s, ymm_s, ymm_u);
+      vmovups(ptr[reg_ptr_ht + offset], ymm_s);
+    } else if (id_ == 1) {
+      // ht = act_gate(r) * ht_1
+      act<ymm_t>(ymm_r, ymm_r, act_gate_);
+      vmulps(ymm_r, ymm_r, ymm_ht_1);
+      vmovups(ptr[reg_ptr_ht + offset], ymm_r);
+    } else if (id_ == 2) {
+      // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
+      ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx());
+      act<ymm_t>(ymm_u, ymm_u, act_gate_);
+      act<ymm_t>(ymm_s, ymm_s, act_cand_);
+      vmulps(ymm_s, ymm_s, ymm_u);
+      vsubps(ymm_u, ymm_one_inner, ymm_u);
+      vmulps(ymm_u, ymm_ht_1, ymm_u);
+      vaddps(ymm_u, ymm_s, ymm_u);
+      vmovups(ptr[reg_ptr_ht + offset], ymm_u);
+    }
+    offset += sizeof(float) * YMM_FLOAT_BLOCK;
+  }
+  ret();
+}
+
+#define DECLARE_GRU_CREATOR(name)                                 \
+  class name##Creator : public JitCodeCreator<gru_attr_t> {       \
+   public:                                                        \
+    /* TODO(TJ): enable more */                                   \
+    bool UseMe(const gru_attr_t& attr) const override {           \
+      return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
+    }                                                             \
+    size_t CodeSize(const gru_attr_t& attr) const override {      \
+      return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8;          \
+    }                                                             \
+    std::unique_ptr<GenBase> CreateJitCode(                       \
+        const gru_attr_t& attr) const override {                  \
+      return make_unique<name##JitCode>(attr, CodeSize(attr));    \
+    }                                                             \
+  }
+
+DECLARE_GRU_CREATOR(GRUH1);
+DECLARE_GRU_CREATOR(GRUHtPart1);
+DECLARE_GRU_CREATOR(GRUHtPart2);
+
+#undef DECLARE_GRU_CREATOR
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kGRUH1, gen::GRUH1Creator);
+REGISTER_JITKERNEL_GEN(kGRUHtPart1, gen::GRUHtPart1Creator);
+REGISTER_JITKERNEL_GEN(kGRUHtPart2, gen::GRUHtPart2Creator);
diff --git a/paddle/fluid/operators/jit/gen/gru.h b/paddle/fluid/operators/jit/gen/gru.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4d7222a3459d175fc5eaf5cdf0e7a1a610f8b0c
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/gru.h
@@ -0,0 +1,113 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/act.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+class GRUJitCode : public VActFunc {
+ public:
+  explicit GRUJitCode(int id, const gru_attr_t& attr, size_t code_size,
+                      void* code_ptr = nullptr)
+      : VActFunc(code_size, code_ptr), id_(id), num_(attr.d) {
+    auto typeExchange = [](KernelType type) -> gen::operand_type {
+      if (type == KernelType::kVSigmoid) {
+        return operand_type::SIGMOID;
+      } else if (type == KernelType::kVRelu) {
+        return operand_type::RELU;
+      } else if (type == KernelType::kVTanh) {
+        return operand_type::TANH;
+      } else if (type == KernelType::kVIdentity) {
+        return operand_type::IDENTITY;
+      } else {
+        LOG(FATAL) << "Do not support this jit::KernelType: " << type;
+      }
+      return operand_type::IDENTITY;
+    };
+    act_gate_ = typeExchange(attr.act_gate);
+    act_cand_ = typeExchange(attr.act_cand);
+
+    this->genCode();
+  }
+
+  const char* name() const override {
+    std::string base = "GRUJitCode";
+    if (id_ == 0) {
+      base += "_H1";
+    } else if (id_ == 1) {
+      base += "_HtPart1";
+    } else if (id_ == 2) {
+      base += "_HtPart2";
+    }
+    auto AddTypeStr = [&](operand_type type) {
+      switch (type) {
+        case operand_type::RELU:
+          base += "_Relu";
+          break;
+        case operand_type::EXP:
+          base += "_Exp";
+          break;
+        case operand_type::SIGMOID:
+          base += "_Sigmoid";
+          break;
+        case operand_type::TANH:
+          base += "_Tanh";
+          break;
+        case operand_type::IDENTITY:
+          base += "_Identity";
+          break;
+        default:
+          break;
+      }
+    };
+    AddTypeStr(act_gate_);
+    AddTypeStr(act_cand_);
+    return base.c_str();
+  }
+  void genCode() override;
+
+ protected:
+  int id_;
+  int num_;
+  operand_type act_gate_;
+  operand_type act_cand_;
+  reg64_t param1{abi_param1};
+};
+
+#define DECLARE_GRU_JITCODE(name, id)                                \
+  class name##JitCode : public GRUJitCode {                          \
+   public:                                                           \
+    explicit name##JitCode(const gru_attr_t& attr, size_t code_size, \
+                           void* code_ptr = nullptr)                 \
+        : GRUJitCode(id, attr, code_size, code_ptr) {}               \
+  };
+
+DECLARE_GRU_JITCODE(GRUH1, 0);
+DECLARE_GRU_JITCODE(GRUHtPart1, 1);
+DECLARE_GRU_JITCODE(GRUHtPart2, 2);
+
+#undef DECLARE_GRU_JITCODE
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b7234c1cb5d15d290685a3dceb3b757be1ef0c6
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -0,0 +1,126 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <type_traits>
+#include "paddle/fluid/operators/jit/gen_base.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+#define XBYAK_USE_MMAP_ALLOCATOR
+#include "xbyak/xbyak.h"
+#include "xbyak/xbyak_util.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+// Application Binary Interface
+constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI),
+    abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX),
+    abi_param4(Xbyak::Operand::RCX);
+
+constexpr Xbyak::Operand::Code g_abi_regs[] = {
+    Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
+    Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15};
+
+constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]);
+
+using reg64_t = const Xbyak::Reg64;
+using reg32_t = const Xbyak::Reg32;
+using xmm_t = const Xbyak::Xmm;
+using ymm_t = const Xbyak::Ymm;
+using zmm_t = const Xbyak::Zmm;
+using Label = Xbyak::Label;
+
+typedef enum {
+  MUL = 0,
+  ADD,
+  SUB,
+  RELU,
+  EXP,
+  SIGMOID,
+  TANH,
+  IDENTITY
+} operand_type;
+
+#define DECLARE_JIT_CODE(codename) \
+  const char* name() const override { return #codename; }
+
+class JitCode : public GenBase, public Xbyak::CodeGenerator {
+ public:
+  explicit JitCode(size_t code_size, void* code_ptr = nullptr)
+      : Xbyak::CodeGenerator(
+            (code_size % 4096 != 0 ? (code_size / 4096 + 1) * 4096 : code_size),
+            code_ptr) {}
+
+  virtual const char* name() const = 0;
+  virtual void genCode() = 0;
+
+  size_t getSize() const override { return CodeGenerator::getSize(); }
+  const unsigned char* getCodeInternal() override {
+    const Xbyak::uint8* code = CodeGenerator::getCode();
+    return code;
+  }
+
+ protected:
+  Xbyak::Reg64 param1{abi_param1};
+  const int EVEX_max_8b_offt = 0x200;
+  const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp;
+
+  virtual void preCode() {
+    for (int i = 0; i < num_g_abi_regs; ++i) {
+      push(Xbyak::Reg64(g_abi_regs[i]));
+    }
+    if (platform::MayIUse(platform::avx512f)) {
+      mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
+    }
+  }
+  virtual void postCode() {
+    for (int i = 0; i < num_g_abi_regs; ++i) {
+      pop(Xbyak::Reg64(g_abi_regs[num_g_abi_regs - 1 - i]));
+    }
+    ret();
+  }
+  void L(const char* label) { Xbyak::CodeGenerator::L(label); }
+  void L(const Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); }
+  // Enhanced vector extension
+  Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt,
+                                    bool bcast = false) {
+    int scale = 0;
+    // Learn from https://github.com/intel/mkl-dnn
+    if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) {
+      offt = offt - 2 * EVEX_max_8b_offt;
+      scale = 1;
+    } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) {
+      offt = offt - 4 * EVEX_max_8b_offt;
+      scale = 2;
+    }
+    auto re = Xbyak::RegExp() + base + offt;
+    if (scale) {
+      re = re + reg_EVEX_max_8b_offt * scale;
+    }
+    if (bcast) {
+      return zword_b[re];
+    } else {
+      return zword[re];
+    }
+  }
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..08bafb5a81882072129a4bfa86d5aff2d33a79a1
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/lstm.cc
@@ -0,0 +1,142 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/lstm.h"
+#include <stddef.h>  // offsetof
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+void LSTMJitCode::genCode() {
+  if (use_peephole_) {
+    preCode();
+  }
+  reg64_t reg_ptr_gates = rax;
+  reg64_t reg_ptr_ct_1 = r9;
+  reg64_t reg_ptr_ct = r10;
+  reg64_t reg_ptr_ht = r11;
+  reg64_t reg_ptr_wp = r12;
+  mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]);
+  mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]);
+  mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]);
+  mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]);
+  if (use_peephole_) {
+    mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]);
+  }
+
+  int offset = 0;
+  int d = num_ * sizeof(float);
+  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
+    /* gates: W_ch, W_ih, W_fh, W_oh */
+    ymm_t ymm_c = ymm_t(0);
+    ymm_t ymm_i = ymm_t(1);
+    ymm_t ymm_f = ymm_t(2);
+    ymm_t ymm_o = ymm_t(3);
+    ymm_t ymm_ct_1 = ymm_t(4);
+    ymm_t ymm_wp0 = ymm_t(5);
+    ymm_t ymm_wp1 = ymm_t(6);
+    ymm_t ymm_wp2 = ymm_t(7);
+    vmovups(ymm_c, ptr[reg_ptr_gates + offset]);
+    vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]);
+    vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]);
+    vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]);
+    if (!compute_c1h1_) {
+      vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]);
+    }
+    if (use_peephole_) {
+      vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]);
+      vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]);
+      vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]);
+    }
+    /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */
+    // act_cand(c)
+    act<ymm_t>(ymm_c, ymm_c, act_cand_);
+    // act_gate(i) or act_gate(ct_1 * wp0 + i)
+    if (!compute_c1h1_ && use_peephole_) {
+      vmulps(ymm_wp0, ymm_ct_1, ymm_wp0);
+      vaddps(ymm_i, ymm_i, ymm_wp0);
+    }
+    act<ymm_t>(ymm_i, ymm_i, act_gate_);
+    vmulps(ymm_c, ymm_c, ymm_i);
+    if (!compute_c1h1_) {
+      // act_gate(f) or act_gate(ct_1 * wp1 + f)
+      if (use_peephole_) {
+        vmulps(ymm_wp1, ymm_ct_1, ymm_wp1);
+        vaddps(ymm_f, ymm_f, ymm_wp1);
+      }
+      act<ymm_t>(ymm_f, ymm_f, act_gate_);
+      // ct
+      vmulps(ymm_f, ymm_f, ymm_ct_1);
+      vaddps(ymm_f, ymm_f, ymm_c);
+    }
+    /* H_t = act_cell(C_t) * act_gate(o) */
+    // act_cell(C_t)
+    ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f;
+    ymm_t ymm_tmp = ymm_i;
+    act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
+    // act_gate(o) or act_gate(ct * wp2 + o)
+    if (use_peephole_) {
+      vmulps(ymm_wp2, ymm_ct, ymm_wp2);
+      vaddps(ymm_o, ymm_o, ymm_wp2);
+    }
+    act<ymm_t>(ymm_o, ymm_o, act_gate_);
+    // ht
+    vmulps(ymm_o, ymm_o, ymm_tmp);
+    // save ct and ht
+    vmovups(ptr[reg_ptr_ct + offset], ymm_ct);
+    vmovups(ptr[reg_ptr_ht + offset], ymm_o);
+    offset += sizeof(float) * YMM_FLOAT_BLOCK;
+  }
+
+  if (use_peephole_) {
+    postCode();
+  } else {
+    ret();
+  }
+}
+
+#define DECLARE_LSTM_CREATOR(name)                                \
+  class name##Creator : public JitCodeCreator<lstm_attr_t> {      \
+   public:                                                        \
+    /* TODO(TJ): enable more */                                   \
+    bool UseMe(const lstm_attr_t& attr) const override {          \
+      return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
+    }                                                             \
+    size_t CodeSize(const lstm_attr_t& attr) const override {     \
+      return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8;          \
+    }                                                             \
+    std::unique_ptr<GenBase> CreateJitCode(                       \
+        const lstm_attr_t& attr) const override {                 \
+      return make_unique<name##JitCode>(attr, CodeSize(attr));    \
+    }                                                             \
+  }
+
+DECLARE_LSTM_CREATOR(LSTMCtHt);
+DECLARE_LSTM_CREATOR(LSTMC1H1);
+
+#undef DECLARE_LSTM_CREATOR
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kLSTMCtHt, gen::LSTMCtHtCreator);
+REGISTER_JITKERNEL_GEN(kLSTMC1H1, gen::LSTMC1H1Creator);
diff --git a/paddle/fluid/operators/jit/gen/lstm.h b/paddle/fluid/operators/jit/gen/lstm.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4753bca23de91c74415d41c372cde1610712ef7
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/lstm.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/act.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+class LSTMJitCode : public VActFunc {
+ public:
+  explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr,
+                       size_t code_size, void* code_ptr = nullptr)
+      : VActFunc(code_size, code_ptr),
+        num_(attr.d),
+        compute_c1h1_(compute_c1h1),
+        use_peephole_(attr.use_peephole) {
+    auto typeExchange = [](KernelType type) -> gen::operand_type {
+      if (type == KernelType::kVSigmoid) {
+        return operand_type::SIGMOID;
+      } else if (type == KernelType::kVRelu) {
+        return operand_type::RELU;
+      } else if (type == KernelType::kVTanh) {
+        return operand_type::TANH;
+      } else if (type == KernelType::kVIdentity) {
+        return operand_type::IDENTITY;
+      } else {
+        LOG(FATAL) << "Do not support this jit::KernelType: " << type;
+      }
+      return operand_type::IDENTITY;
+    };
+    act_gate_ = typeExchange(attr.act_gate);
+    act_cand_ = typeExchange(attr.act_cand);
+    act_cell_ = typeExchange(attr.act_cell);
+
+    this->genCode();
+  }
+
+  const char* name() const override {
+    std::string base = "LSTMJitCode";
+    if (use_peephole_) {
+      base += "_Peephole";
+    }
+    if (compute_c1h1_) {
+      base += "_C1H1";
+    }
+    auto AddTypeStr = [&](operand_type type) {
+      switch (type) {
+        case operand_type::RELU:
+          base += "_Relu";
+          break;
+        case operand_type::EXP:
+          base += "_Exp";
+          break;
+        case operand_type::SIGMOID:
+          base += "_Sigmoid";
+          break;
+        case operand_type::TANH:
+          base += "_Tanh";
+          break;
+        case operand_type::IDENTITY:
+          base += "_Identity";
+          break;
+        default:
+          break;
+      }
+    };
+    AddTypeStr(act_gate_);
+    AddTypeStr(act_cand_);
+    AddTypeStr(act_cell_);
+    return base.c_str();
+  }
+  void genCode() override;
+
+ protected:
+  int num_;
+  bool compute_c1h1_;
+  bool use_peephole_;
+  operand_type act_gate_;
+  operand_type act_cand_;
+  operand_type act_cell_;
+  reg64_t param1{abi_param1};
+};
+
+#define DECLARE_LSTM_JITCODE(name, compute_c1h1)                      \
+  class name##JitCode : public LSTMJitCode {                          \
+   public:                                                            \
+    explicit name##JitCode(const lstm_attr_t& attr, size_t code_size, \
+                           void* code_ptr = nullptr)                  \
+        : LSTMJitCode(compute_c1h1, attr, code_size, code_ptr) {}     \
+  };
+
+DECLARE_LSTM_JITCODE(LSTMCtHt, false);
+DECLARE_LSTM_JITCODE(LSTMC1H1, true);
+
+#undef DECLARE_LSTM_JITCODE
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc
new file mode 100644
index 0000000000000000000000000000000000000000..310da0c76f1ab251d788e54f2305f375f3fb4838
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen_base.cc
@@ -0,0 +1,43 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen_base.h"
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
+
+namespace paddle {
+namespace operators {
+namespace jit {
+
+// refer do not need useme, it would be the last one.
+void GenBase::dumpCode(const unsigned char* code) const {
+  if (code) {
+    static int counter = 0;
+    std::ostringstream filename;
+    filename << "paddle_jitcode_" << name() << "." << counter << ".bin";
+    counter++;
+    std::ofstream fout(filename.str(), std::ios::out);
+    if (fout.is_open()) {
+      fout.write(reinterpret_cast<const char*>(code), this->getSize());
+      fout.close();
+    }
+  }
+}
+
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..4af01a437670aa6a07d370ff23ed2abd369f69a3
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <gflags/gflags.h>
+#include <memory>  // for unique_ptr
+#include "paddle/fluid/operators/jit/kernel_base.h"
+
+DECLARE_bool(dump_jitcode);
+
+namespace paddle {
+namespace operators {
+namespace jit {
+
+class GenBase : public Kernel {
+ public:
+  virtual ~GenBase() = default;
+  virtual const char* name() const = 0;
+  virtual size_t getSize() const = 0;
+  virtual const unsigned char* getCodeInternal() = 0;
+  template <typename Func>
+  Func getCode() {
+    const unsigned char* code = this->getCodeInternal();
+    if (FLAGS_dump_jitcode) {
+      this->dumpCode(code);
+    }
+    // Note: failed to cast with reinterpret_cast<const Func> on Mac clang,
+    // then workaround with const_cast. Any better idea is appreciated.
+    return reinterpret_cast<Func>(const_cast<unsigned char*>(code));
+  }
+
+ protected:
+  void dumpCode(const unsigned char* code) const;
+};
+
+// Creator is used to creat the jitcode and save in pool.
+// Every JitCode should have one creator.
+class GenCreator {
+ public:
+  virtual ~GenCreator() = default;
+};
+
+template <typename Attr>
+class JitCodeCreator : public GenCreator {
+ public:
+  virtual ~JitCodeCreator() = default;
+
+  // condition when this jit code can be used.
+  virtual bool UseMe(const Attr& attr) const = 0;
+
+  // estimate this code size
+  virtual size_t CodeSize(const Attr& attr) const = 0;
+
+  // create this code
+  virtual std::unique_ptr<GenBase> CreateJitCode(const Attr& attr) const = 0;
+};
+
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d00584baa081c21762774aef4cbbc714d49cd012
--- /dev/null
+++ b/paddle/fluid/operators/jit/helper.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/helper.h"
+#include <algorithm>  // tolower
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+
+#define ONE_CASE(key) \
+  case key:           \
+    return #key
+
+const char* to_string(KernelType kt) {
+  switch (kt) {
+    ONE_CASE(kVMul);
+    ONE_CASE(kVAdd);
+    ONE_CASE(kVAddRelu);
+    ONE_CASE(kVSub);
+    ONE_CASE(kVScal);
+    ONE_CASE(kVAddBias);
+    ONE_CASE(kVRelu);
+    ONE_CASE(kVIdentity);
+    ONE_CASE(kVExp);
+    ONE_CASE(kVSigmoid);
+    ONE_CASE(kVTanh);
+    ONE_CASE(kLSTMCtHt);
+    ONE_CASE(kLSTMC1H1);
+    ONE_CASE(kGRUH1);
+    ONE_CASE(kGRUHtPart1);
+    ONE_CASE(kGRUHtPart2);
+    ONE_CASE(kCRFDecoding);
+    ONE_CASE(kLayerNorm);
+    ONE_CASE(kNCHW16CMulNC);
+    default:
+      PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
+      return "NOT JITKernel";
+  }
+  return nullptr;
+}
+#undef ONE_CASE
+
+KernelType to_kerneltype(const std::string& act) {
+  std::string lower = act;
+  std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
+  if (lower == "relu" || lower == "vrelu") {
+    return kVRelu;
+  } else if (lower == "identity" || lower == "videntity" || lower == "") {
+    return kVIdentity;
+  } else if (lower == "exp" || lower == "vexp") {
+    return kVExp;
+  } else if (lower == "sigmoid" || lower == "vsigmoid") {
+    return kVSigmoid;
+  } else if (lower == "tanh" || lower == "vtanh") {
+    return kVTanh;
+  }
+  PADDLE_THROW("Not support type: %s, or forget to add this case", act);
+  return kNone;
+}
+
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..412df86aa1cd94871989aef25adef803f673812b
--- /dev/null
+++ b/paddle/fluid/operators/jit/helper.h
@@ -0,0 +1,140 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include "paddle/fluid/operators/jit/gen_base.h"
+#include "paddle/fluid/operators/jit/kernel_base.h"
+#include "paddle/fluid/operators/jit/kernel_key.h"
+#include "paddle/fluid/operators/jit/kernel_pool.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+
+template <KernelType KT, typename KernelTuples, typename PlaceType>
+inline typename std::enable_if<
+    std::is_same<typename KernelTuples::data_type, float>::value &&
+        std::is_same<PlaceType, platform::CPUPlace>::value,
+    typename KernelTuples::func_type>::type
+GetJitCode(const typename KernelTuples::attr_type& attr) {
+  using Func = typename KernelTuples::func_type;
+  using Attr = typename KernelTuples::attr_type;
+  size_t key = JitCodeKey<Attr>(attr);
+  auto& codes = JitCodePool<KT>().Instance();
+  if (codes.Has(key)) {
+    return codes.AllKernels().at(key)->template getCode<Func>();
+  }
+
+  // creator is not related with attr, so can use KernelKey as key
+  KernelKey kkey(KT, PlaceType());
+  // pool: (KernelKey(type, place), vector<GenCreatorPtr>)
+  auto& creator_map = JitCodeCreatorPool().Instance().AllCreators();
+  auto iter = creator_map.find(kkey);
+  if (iter != creator_map.end()) {
+    auto& creators = iter->second;
+    for (auto& cur : creators) {
+      auto i = dynamic_cast<const JitCodeCreator<Attr>*>(cur.get());
+      if (i && i->UseMe(attr)) {
+        auto p = i->CreateJitCode(attr);
+        if (p) {
+          auto f = p->template getCode<Func>();
+          codes.Insert(key, std::move(p));
+          return f;
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
+template <KernelType KT, typename KernelTuples, typename PlaceType>
+inline typename std::enable_if<
+    !std::is_same<typename KernelTuples::data_type, float>::value ||
+        !std::is_same<PlaceType, platform::CPUPlace>::value,
+    typename KernelTuples::func_type>::type
+GetJitCode(const typename KernelTuples::attr_type& attr) {
+  return nullptr;
+}
+
+// Refer code do not related with attr, which is just for cast
+// Refer is always on CPUPlace
+template <KernelType KT, typename KernelTuples>
+inline typename KernelTuples::func_type GetRefer() {
+  auto& ref_pool = ReferKernelPool().Instance().AllKernels();
+  KernelKey kkey(KT, platform::CPUPlace());
+  auto ref_iter = ref_pool.find(kkey);
+  PADDLE_ENFORCE(ref_iter != ref_pool.end(),
+                 "Every Kernel should have reference function.");
+  auto& ref_impls = ref_iter->second;
+  for (auto& impl : ref_impls) {
+    auto i = dynamic_cast<const ReferKernel<KernelTuples>*>(impl.get());
+    if (i) {
+      return i->GetFunc();
+    }
+  }
+  return nullptr;
+}
+
+template <KernelType KT, typename KernelTuples,
+          typename PlaceType = platform::CPUPlace>
+typename KernelTuples::func_type Get(
+    const typename KernelTuples::attr_type& attr) {
+  auto jitfunc = GetJitCode<KT, KernelTuples, PlaceType>(attr);
+  if (jitfunc) {
+    return jitfunc;
+  }
+
+  // pool: (KernelKey(type, place), vector<KernelPtr>)
+  KernelKey kkey(KT, PlaceType());
+  auto& pool = KernelPool().Instance().AllKernels();
+  auto iter = pool.find(kkey);
+  if (iter != pool.end()) {
+    auto& impls = iter->second;
+    for (auto& impl : impls) {
+      auto i = dynamic_cast<const KernelMore<KernelTuples>*>(impl.get());
+      if (i && i->UseMe(attr)) {
+        return i->GetFunc();
+      }
+    }
+  }
+
+  // The last implementation should be reference function on CPUPlace.
+  return GetRefer<KT, KernelTuples>();
+}
+
+const char* to_string(KernelType kt);
+
+KernelType to_kerneltype(const std::string& act);
+
+inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) {
+  os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
+     << "],act_cand[" << to_string(attr.act_cand) << "],act_cell["
+     << to_string(attr.act_cell) << "],use_peephole["
+     << (attr.use_peephole ? "True" : "False") << "]";
+  return os;
+}
+inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
+  os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
+     << "],act_cand[" << to_string(attr.act_cand) << "]";
+  return os;
+}
+
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4a2d5d47301a2fd82bf27ddfaaa31ef23e431c2
--- /dev/null
+++ b/paddle/fluid/operators/jit/kernel_base.h
@@ -0,0 +1,172 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/operators/jit/macro.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+
+typedef enum {
+  kNone = 0,
+  kVMul = 1,
+  kVAdd = 2,
+  kVAddRelu,
+  kVSub,
+  kVScal,
+  kVAddBias,
+  kVRelu,
+  kVIdentity,
+  kVExp,
+  kVSigmoid,
+  kVTanh,
+  kLSTMCtHt,
+  kLSTMC1H1,
+  kGRUH1,
+  kGRUHtPart1,
+  kGRUHtPart2,
+  kCRFDecoding,
+  kLayerNorm,
+  kNCHW16CMulNC,
+} KernelType;
+
+template <typename T>
+struct XYZNTuples {
+  typedef T data_type;
+  typedef int attr_type;
+  typedef void (*func_type)(const T*, const T*, T*, int);
+};
+
+template <typename T>
+struct AXYNTuples : public XYZNTuples<T> {};
+
+template <typename T>
+struct XYNTuples {
+  typedef T data_type;
+  typedef int attr_type;
+  typedef void (*func_type)(const T*, T*, int);
+};
+
+typedef struct {
+  void* gates;  // gates: x_ch, x_ih, x_fh, x_oh
+  const void* ct_1;
+  void* ct;
+  void* ht;
+  /* weight_peephole and checked data are only used in peephole*/
+  const void* wp{nullptr};  //  W_ic, W_fc, W_oc
+  void* checked{nullptr};   // size: 2 * d
+} lstm_t;
+
+typedef struct {
+  void* gates;  // gates: {x_update, x_reset; x_state}
+  const void* ht_1;
+  void* ht;
+} gru_t;
+
+struct rnn_attr_s {
+  int d;
+  KernelType act_gate, act_cand;
+  rnn_attr_s() = default;
+  explicit rnn_attr_s(int _d, KernelType _act_gate, KernelType _act_cand)
+      : d(_d), act_gate(_act_gate), act_cand(_act_cand) {}
+};
+
+struct lstm_attr_s : public rnn_attr_s {
+  bool use_peephole;
+  KernelType act_cell;
+  lstm_attr_s() = default;
+  explicit lstm_attr_s(int _d, KernelType _act_gate, KernelType _act_cand,
+                       KernelType _act_cell, bool _use_peephole = false)
+      : rnn_attr_s(_d, _act_gate, _act_cand),
+        use_peephole(_use_peephole),
+        act_cell(_act_cell) {}
+};
+
+typedef struct rnn_attr_s gru_attr_t;
+typedef struct lstm_attr_s lstm_attr_t;
+
+template <typename T>
+struct LSTMTuples {
+  typedef T data_type;
+  typedef lstm_attr_t attr_type;
+  typedef void (*func_type)(lstm_t*, const lstm_attr_t*);
+};
+
+template <typename T>
+struct GRUTuples {
+  typedef T data_type;
+  typedef gru_attr_t attr_type;
+  typedef void (*func_type)(gru_t*, const gru_attr_t*);
+};
+
+template <typename T>
+struct CRFDecodingTuples {
+  typedef T data_type;
+  typedef int attr_type;
+  typedef void (*func_type)(const int, const T*, const T*, T*, int*, int);
+};
+
+template <typename T>
+struct LayerNormTuples {
+  typedef T data_type;
+  typedef int attr_type;
+  typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int,
+                            const float, int);
+};
+
+// nChw16c = nChw16c .* NC
+template <typename T>
+struct NCHW16CMulNCTuples {
+  typedef T data_type;
+  typedef int attr_type;
+  typedef void (*func_type)(const T*, const T*, T*, int, int);
+};
+
+// Just for adding to kernel pool without template
+class Kernel {
+ public:
+  Kernel() = default;
+  virtual ~Kernel() = default;
+  DISABLE_COPY_AND_ASSIGN(Kernel);
+};
+
+template <typename KernelTuples>
+class KernelMore : public Kernel {
+ public:
+  using T = typename KernelTuples::data_type;
+  using Func = typename KernelTuples::func_type;
+  using Attr = typename KernelTuples::attr_type;
+  virtual Func GetFunc() const { return func; }
+  virtual bool UseMe(const Attr& attr) const = 0;
+  virtual const char* ImplType() const = 0;
+
+ protected:
+  Func func{nullptr};
+};
+
+template <typename KernelTuples>
+class ReferKernel : public KernelMore<KernelTuples> {
+ public:
+  // Refer code can always be used
+  bool UseMe(const typename KernelTuples::attr_type& attr) const override {
+    return true;
+  }
+  const char* ImplType() const override { return "Refer"; }
+};
+
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e6a19f04fd425b920aeea49b63001941d800a73
--- /dev/null
+++ b/paddle/fluid/operators/jit/kernel_key.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/kernel_key.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+
+template <>
+size_t JitCodeKey<int>(const int& d) {
+  return d;
+}
+
+constexpr int act_type_shift = 3;  // suppot 2^3 act types
+
+template <>
+size_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
+  size_t key = attr.d;
+  int gate_key = static_cast<int>(attr.act_gate) << 1;
+  int cand_key = static_cast<int>(attr.act_cand) << (1 + act_type_shift);
+  int cell_key = static_cast<int>(attr.act_cell) << (1 + act_type_shift * 2);
+  return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key +
+         attr.use_peephole;
+}
+
+template <>
+size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
+  size_t key = attr.d;
+  return (key << (act_type_shift * 2)) + static_cast<int>(attr.act_gate) +
+         (static_cast<int>(attr.act_cand) << act_type_shift);
+}
+
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..611a0210d614196ad0b05d583303688c1d964e04
--- /dev/null
+++ b/paddle/fluid/operators/jit/kernel_key.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/operators/jit/kernel_base.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+
+struct KernelKey {
+  struct Hash {
+    size_t operator()(const KernelKey& key) const {
+      int place = key.place_.which();               // less than 2^8
+      int type = static_cast<int>(key.type_) << 8;  // less than 2^(32-8)
+      std::hash<int> hasher;
+      return hasher(place + type);
+    }
+  };
+
+  KernelType type_;
+  platform::Place place_;
+
+  KernelKey(KernelType type, platform::Place place)
+      : type_(type), place_(place) {}
+  size_t hash_key() const { return Hash()(*this); }
+
+  bool operator==(const KernelKey& o) const {
+    return platform::places_are_same_class(place_, o.place_) &&
+           type_ == o.type_;
+  }
+  bool operator!=(const KernelKey& o) const { return !(*this == o); }
+};
+
+// Every JitCode should have a method to get the key from attribution
+template <typename Attr>
+size_t JitCodeKey(const Attr& attr);
+
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_pool.cc b/paddle/fluid/operators/jit/kernel_pool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc98c644fbee2cd54faf4dc9fe151b8be131bd7b
--- /dev/null
+++ b/paddle/fluid/operators/jit/kernel_pool.cc
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/kernel_pool.h"
+#include <memory>  // for shared_ptr
+#include <string>
+#include <unordered_map>
+
+namespace paddle {
+namespace operators {
+namespace jit {
+
+JitCodeCreatorPool& JitCodeCreatorPool::Instance() {
+  static JitCodeCreatorPool g_creator_pool;
+  return g_creator_pool;
+}
+
+KernelPool& KernelPool::Instance() {
+  static KernelPool g_kernel_pool;
+  return g_kernel_pool;
+}
+
+ReferKernelPool& ReferKernelPool::Instance() {
+  static ReferKernelPool g_refer_kernel_pool;
+  return g_refer_kernel_pool;
+}
+
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e15242af28839ee0759e1a5b3930d6d6bfaa0ff
--- /dev/null
+++ b/paddle/fluid/operators/jit/kernel_pool.h
@@ -0,0 +1,119 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <memory>  // for unique_ptr
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/operators/jit/gen_base.h"
+#include "paddle/fluid/operators/jit/kernel_base.h"
+#include "paddle/fluid/operators/jit/kernel_key.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+
+template <KernelType KT>
+class JitCodePool {
+  typedef std::unique_ptr<GenBase> GenBasePtr;
+  typedef std::unordered_map<size_t, GenBasePtr> JitCodeMap;
+
+ public:
+  JitCodePool() = default;
+  static JitCodePool& Instance() {
+    static thread_local JitCodePool<KT> g_jit_codes;
+    return g_jit_codes;
+  }
+
+  const JitCodeMap& AllKernels() { return codes_; }
+
+  bool Has(size_t key) const { return codes_.find(key) != codes_.end(); }
+
+  void Insert(size_t key, GenBasePtr value) {
+    codes_.emplace(key, std::move(value));
+  }
+
+ private:
+  JitCodeMap codes_;
+  DISABLE_COPY_AND_ASSIGN(JitCodePool);
+};
+
+class JitCodeCreatorPool {
+  typedef std::unique_ptr<const GenCreator> GenCreatorPtr;
+  typedef std::unordered_map<KernelKey, std::vector<GenCreatorPtr>,
+                             KernelKey::Hash>
+      GenCreatorPtrMap;
+
+ public:
+  JitCodeCreatorPool() = default;
+  static JitCodeCreatorPool& Instance();
+  GenCreatorPtrMap& AllCreators() { return creators_; }
+  void Insert(const KernelKey& key, GenCreatorPtr value) {
+    if (creators_.find(key) == creators_.end()) {
+      creators_.emplace(key, std::vector<GenCreatorPtr>());
+    }
+    creators_.at(key).emplace_back(std::move(value));
+  }
+
+ private:
+  GenCreatorPtrMap creators_;
+  DISABLE_COPY_AND_ASSIGN(JitCodeCreatorPool);
+};
+
+typedef std::unique_ptr<const Kernel> KernelPtr;
+typedef std::unordered_map<KernelKey, std::vector<KernelPtr>, KernelKey::Hash>
+    KernelMap;
+
+class KernelPool {
+ public:
+  static KernelPool& Instance();
+  KernelPool() = default;
+  KernelMap& AllKernels() { return pool_; }
+  void Insert(const KernelKey& key, KernelPtr value) {
+    if (pool_.find(key) == pool_.end()) {
+      pool_.emplace(key, std::vector<KernelPtr>());
+    }
+    pool_.at(key).emplace_back(std::move(value));
+  }
+
+ private:
+  KernelMap pool_;
+  DISABLE_COPY_AND_ASSIGN(KernelPool);
+};
+
+// Every kernel should have refer code and it should be used in unit tests,
+// so refer kernels should have it's independent kernel pool
+class ReferKernelPool {
+ public:
+  static ReferKernelPool& Instance();
+  ReferKernelPool() = default;
+  KernelMap& AllKernels() { return pool_; }
+  void Insert(const KernelKey& key, KernelPtr value) {
+    if (pool_.find(key) == pool_.end()) {
+      pool_.emplace(key, std::vector<KernelPtr>());
+    }
+    pool_.at(key).emplace_back(std::move(value));
+  }
+
+ private:
+  KernelMap pool_;
+  DISABLE_COPY_AND_ASSIGN(ReferKernelPool);
+};
+
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/macro.h b/paddle/fluid/operators/jit/macro.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2622eba8b70cc553a2da44638d577c9d7751b25
--- /dev/null
+++ b/paddle/fluid/operators/jit/macro.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+#include <type_traits>
+
+namespace paddle {
+namespace operators {
+namespace jit {
+
+#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MAX 13.0
+#define EXP_MAX_INPUT 40.0
+
+#define XMM_FLOAT_BLOCK 4
+#define YMM_FLOAT_BLOCK 8
+#define ZMM_FLOAT_BLOCK 16
+
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/CMakeLists.txt b/paddle/fluid/operators/jit/more/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fa503356baa73cb76e50ff19901a56d0c987ad99
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+function(USE_JITKERNEL_MORE TARGET TYPE)
+    file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n")
+endfunction()
+
+if(WITH_MKLML)
+    add_subdirectory(mkl)
+endif()
+
+if(WITH_AVX)
+    add_subdirectory(intrinsic)
+endif()
+
+# mix should be last
+add_subdirectory(mix)
+
+set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..468937a4f6b27ae525bfd0d8e99cc891eedbc353
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
@@ -0,0 +1,9 @@
+
+file(GLOB jit_kernel_cc_intrinsic RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
+cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_base)
+
+set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE)
+
+# use mkl kernels by name and type
+USE_JITKERNEL_MORE(kCRFDecoding, intrinsic)
+USE_JITKERNEL_MORE(kLayerNorm, intrinsic)
diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
new file mode 100644
index 0000000000000000000000000000000000000000..16c91f8246dda34b1436fd4edd507e9ff603de6b
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
@@ -0,0 +1,181 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h"
+#include <limits>
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace more {
+namespace intrinsic {
+// Note: intrinsic code is not runtime build.
+// For example, if you build code on AVX, and run on AVX512 it can only use AVX
+
+void CRFDecoding(const int seq_len, const float* x, const float* w,
+                 float* alpha, int* track, int tag_num) {
+#ifdef __AVX512F__
+  const int step_size = ZMM_FLOAT_BLOCK;
+#else
+  const int step_size = YMM_FLOAT_BLOCK;
+#endif
+  const int end = tag_num / step_size;
+  const int rest = tag_num % step_size;
+  /* Setup the alpha initial value.*/
+  int i_offset = 0;
+  int last_offset = rest - step_size;
+  for (int i = 0; i <= end; ++i) {
+#ifdef __AVX512F__
+    // Declare the variable for the content of weights, input and alpha values.
+    __m512 w_content, x_content, alpha_content;
+    // Load the relevant data into the variables from un-aligned address.
+    w_content = _mm512_loadu_ps(w + i_offset);
+    x_content = _mm512_loadu_ps(x + i_offset);
+    alpha_content = _mm512_add_ps(w_content, x_content);
+    // Save the alpha value.
+    _mm512_storeu_ps(alpha_value + i_offset, alpha_content);
+#else
+    // AVX or AVX2
+    // weights, input and alpha values.
+    __m256 w_content, x_content, alpha_content;
+    // Load the relevant data into the variables from un-aligned address.
+    w_content = _mm256_loadu_ps(w + i_offset);
+    x_content = _mm256_loadu_ps(x + i_offset);
+    alpha_content = _mm256_add_ps(w_content, x_content);
+    _mm256_storeu_ps(alpha + i_offset, alpha_content);
+#endif
+    i_offset += step_size;
+    if (i == end - 1) {
+      if (rest > 0) {
+        i_offset += last_offset;
+      } else {
+        break;
+      }
+    }
+  }
+  // Use the column-major strategy to get the location of maximum score.
+  int seq_offset = 0;
+  constexpr int state_trans_base_idx = 2;
+  for (int k = 1; k < seq_len; ++k) {
+    int j_offset = 0;
+    for (int j = 0; j <= end; ++j) {
+/* Initialize the variables of maximum score and location.*/
+#ifdef __AVX512F__
+      __m512 max_score = _mm512_set1_ps(-std::numeric_limits<float>::max());
+      __m512i max_j = _mm512_setzero_si512();
+#else
+      __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max());
+      __m256i max_j = _mm256_set1_epi32(0);
+#endif
+      /* Calculate the offset of transition_weights.*/
+      int trans_offset = state_trans_base_idx * tag_num + j_offset;
+      for (int i = 0; i < tag_num; ++i) {
+/* Initalize the content of alpha variable with related offset.*/
+#ifdef __AVX512F__
+        __m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i));
+        /* Obtain the content of weights from un-aligned address.*/
+        __m512 w_content = _mm512_loadu_ps(w + trans_offset);
+        __m512 score_v = _mm512_add_ps(alpha_content, w_content);
+        __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
+        /* AVX512 instructions.*/
+        max_j = _mm512_mask_set1_epi32(max_j, mask, i);
+        /* Update the max_score value.*/
+        max_score = _mm512_max_ps(max_score, score_v);
+
+#else
+        __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i);
+        /* Obtain the content of weights from un-aligned address.*/
+        __m256 w_content = _mm256_loadu_ps(w + trans_offset);
+        __m256 score_v = _mm256_add_ps(alpha_content, w_content);
+        __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
+/* According to the mask value, update the index of the max_score.*/
+#ifdef __AVX2__
+        max_j = _mm256_or_si256(
+            _mm256_andnot_si256((__m256i)mask, max_j),
+            _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
+#else
+        __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
+        __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
+        __m128i lo_mask =
+            _mm256_extractf128_si256(*(__m256i*)&mask, 0);  // NOLINT
+        __m128i hi_mask =
+            _mm256_extractf128_si256(*(__m256i*)&mask, 1);  // NOLINT
+        lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
+        hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
+        lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
+        hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
+        lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
+        hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
+        max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
+        max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
+#endif
+        /* Update the max_score value.*/
+        max_score = _mm256_max_ps(max_score, score_v);
+
+#endif
+
+        trans_offset += tag_num;
+      }
+/* Update the alpha and track values. */
+#ifdef __AVX512F__
+      __m512 x_content =
+          _mm512_loadu_ps(x + seq_offset + this->num_ + j_offset);
+      max_score = _mm512_add_ps(max_score, x_content);
+      _mm512_storeu_ps(alpha + seq_offset + this->num_ + j_offset, max_score);
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(track + seq_offset +
+                                                     this->num_ + j_offset),
+                          max_j);
+#else
+      __m256 x_content = _mm256_loadu_ps(x + seq_offset + tag_num + j_offset);
+      max_score = _mm256_add_ps(max_score, x_content);
+      _mm256_storeu_ps(alpha + seq_offset + tag_num + j_offset, max_score);
+      _mm256_storeu_si256(
+          reinterpret_cast<__m256i*>(track + seq_offset + tag_num + j_offset),
+          max_j);
+#endif
+
+      /* Calculate the offset of next step*/
+      j_offset += step_size;
+      if (j == end - 1) {
+        if (rest > 0) {
+          j_offset += last_offset;
+        } else {
+          break;
+        }
+      }
+    }
+    seq_offset += tag_num;
+  }
+}
+
+bool CRFDecodingKernel::UseMe(const int& d) const {
+#ifdef __AVX512F__
+  constexpr int block = ZMM_FLOAT_BLOCK;
+#else
+  constexpr int block = YMM_FLOAT_BLOCK;
+#endif
+  return platform::MayIUse(platform::avx) && d >= block;
+}
+
+}  // namespace intrinsic
+}  // namespace more
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace intrinsic = paddle::operators::jit::more::intrinsic;
+
+REGISTER_JITKERNEL_MORE(kCRFDecoding, intrinsic, intrinsic::CRFDecodingKernel);
diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
new file mode 100644
index 0000000000000000000000000000000000000000..24179d90ddcc6e7f44ffa4b2ca0886fbca5c81bf
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <type_traits>
+#include "paddle/fluid/operators/jit/kernel_base.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace more {
+namespace intrinsic {
+
+void CRFDecoding(const int seq_len, const float* x, const float* w,
+                 float* alpha, int* track, int tag_num);
+
+class CRFDecodingKernel : public KernelMore<CRFDecodingTuples<float>> {
+ public:
+  CRFDecodingKernel() { this->func = CRFDecoding; }
+  bool UseMe(
+      const typename CRFDecodingTuples<float>::attr_type&) const override;
+  const char* ImplType() const override { return "Intrinsic"; }
+};
+
+}  // namespace intrinsic
+}  // namespace more
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e9b6e401c6825b21191881d4e57fe09b48d2f4ee
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
@@ -0,0 +1,168 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/more/intrinsic/layer_norm.h"
+#include <limits>
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace more {
+namespace intrinsic {
+
+void LayerNorm(float* x, float* out, float* mean, float* var,
+               const float* scale, const float* bias, int height,
+               const float epsilon, int right) {
+  __m256 sum;
+  __m256 mean_vec, var_vec;
+  __m128 hi, lo;
+  __m256 tmp;
+  size_t offset;
+  size_t j;
+  int block = YMM_FLOAT_BLOCK;
+  const int rest = right % block;
+  const int end = right - rest;
+
+  __m256 reverse_num_vec =
+      _mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(right));
+  __m256 epsilon_vec = _mm256_set1_ps(epsilon);
+  int rest_mask =
+      ((-1) & (~((~0U) >> (sizeof(int) * 8 - (block - rest))))) & 0x0ff;
+  __m256i mask_vec = _mm256_set_epi32(
+      rest_mask & 0x80 ? 0xffffffff : 0, rest_mask & 0x40 ? 0xffffffff : 0,
+      rest_mask & 0x20 ? 0xffffffff : 0, rest_mask & 0x10 ? 0xffffffff : 0,
+      rest_mask & 0x8 ? 0xffffffff : 0, rest_mask & 0x4 ? 0xffffffff : 0,
+      rest_mask & 0x2 ? 0xffffffff : 0, rest_mask & 0x1 ? 0xffffffff : 0);
+
+  for (int i = 0; i < height; ++i) {
+    offset = i * right;
+
+    /* get mean */
+    sum = _mm256_setzero_ps();
+    for (j = offset; j < end + offset; j += block) {
+      sum = _mm256_add_ps(sum, _mm256_loadu_ps((const float*)x + j));
+    }
+    if (rest != 0) {
+      j = offset + right - block;
+      tmp = _mm256_loadu_ps((const float*)x + j);
+      tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp,
+                             *(__m256*)&mask_vec);  // NOLINT
+      sum = _mm256_add_ps(sum, tmp);
+    }
+    hi = _mm256_extractf128_ps(sum, 1);
+    lo = _mm256_extractf128_ps(sum, 0);
+    sum = _mm256_add_ps(
+        sum, _mm256_insertf128_ps(
+                 _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1));
+    sum = _mm256_hadd_ps(sum, sum);
+    sum = _mm256_hadd_ps(sum, sum);
+    mean_vec = _mm256_mul_ps(sum, reverse_num_vec);
+    mean[i] = *reinterpret_cast<float*>(&mean_vec);
+
+    /* get variance */
+    sum = _mm256_setzero_ps();
+    for (j = offset; j < end + offset; j += block) {
+      tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);
+      tmp = _mm256_mul_ps(tmp, tmp);
+      sum = _mm256_add_ps(sum, tmp);
+    }
+    if (rest != 0) {
+      j = offset + right - block;
+      tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);
+      tmp = _mm256_mul_ps(tmp, tmp);
+      tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp,
+                             *(__m256*)&mask_vec);  // NOLINT
+      sum = _mm256_add_ps(sum, tmp);
+    }
+    hi = _mm256_extractf128_ps(sum, 1);
+    lo = _mm256_extractf128_ps(sum, 0);
+    sum = _mm256_add_ps(
+        sum, _mm256_insertf128_ps(
+                 _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1));
+    sum = _mm256_hadd_ps(sum, sum);
+    sum = _mm256_hadd_ps(sum, sum);
+    var_vec = _mm256_mul_ps(sum, reverse_num_vec);
+    var[i] = *reinterpret_cast<float*>(&var_vec);
+
+    /* get x_norm and calculate output*/
+    for (j = offset; j < end + offset; j += block) {
+      tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);
+      tmp = _mm256_div_ps(tmp,
+                          _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec)));
+      _mm256_storeu_ps(reinterpret_cast<float*>(out) + j, tmp);
+    }
+    if (rest != 0) {
+      j = offset + right - block;
+      tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);
+      tmp = _mm256_div_ps(tmp,
+                          _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec)));
+      _mm256_storeu_ps(reinterpret_cast<float*>(out) + j, tmp);
+    }
+
+    if (scale) {
+      if (rest != 0) {
+        j = offset + right - block;
+        tmp = _mm256_loadu_ps((const float*)out + j);
+      }
+      for (j = offset; j < end + offset; j += block) {
+        _mm256_storeu_ps(
+            reinterpret_cast<float*>(out) + j,
+            _mm256_mul_ps(_mm256_loadu_ps((const float*)out + j),
+                          _mm256_loadu_ps((const float*)scale + j - offset)));
+      }
+      if (rest != 0) {
+        j = offset + right - block;
+        _mm256_storeu_ps(
+            reinterpret_cast<float*>(out) + j,
+            _mm256_mul_ps(tmp,
+                          _mm256_loadu_ps((const float*)scale + j - offset)));
+      }
+    }
+
+    if (bias) {
+      if (rest != 0) {
+        j = offset + right - block;
+        tmp = _mm256_loadu_ps((const float*)out + j);
+      }
+      for (j = offset; j < end + offset; j += block) {
+        _mm256_storeu_ps(
+            reinterpret_cast<float*>(out) + j,
+            _mm256_add_ps(_mm256_loadu_ps((const float*)out + j),
+                          _mm256_loadu_ps((const float*)bias + j - offset)));
+      }
+      if (rest != 0) {
+        j = offset + right - block;
+        _mm256_storeu_ps(reinterpret_cast<float*>(out) + j,
+                         _mm256_add_ps(tmp, _mm256_loadu_ps((const float*)bias +
+                                                            j - offset)));
+      }
+    }
+  }
+}
+
+bool LayerNormKernel::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK;
+}
+
+}  // namespace intrinsic
+}  // namespace more
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace intrinsic = paddle::operators::jit::more::intrinsic;
+
+REGISTER_JITKERNEL_MORE(kLayerNorm, intrinsic, intrinsic::LayerNormKernel);
diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
new file mode 100644
index 0000000000000000000000000000000000000000..89da2940f4420c418f9bd5260c4b74606cc9168f
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <type_traits>
+#include "paddle/fluid/operators/jit/kernel_base.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace more {
+namespace intrinsic {
+
+void LayerNorm(float* x, float* out, float* mean, float* var,
+               const float* scale, const float* bias, int height,
+               const float epsilon, int right);
+
+class LayerNormKernel : public KernelMore<LayerNormTuples<float>> {
+ public:
+  LayerNormKernel() { this->func = LayerNorm; }
+  bool UseMe(const typename LayerNormTuples<float>::attr_type&) const override;
+  const char* ImplType() const override { return "Intrinsic"; }
+};
+
+}  // namespace intrinsic
+}  // namespace more
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e05f204b1eebd03c7a00157d96d0482f4a44a7fb
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+
+file(GLOB jit_kernel_mix_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
+cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base)
+
+set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE)
+
+USE_JITKERNEL_MORE(kVSigmoid, mix)
+USE_JITKERNEL_MORE(kVTanh, mix)
+USE_JITKERNEL_MORE(kLSTMCtHt, mix)
+USE_JITKERNEL_MORE(kLSTMC1H1, mix)
+USE_JITKERNEL_MORE(kGRUH1, mix)
+USE_JITKERNEL_MORE(kGRUHtPart1, mix)
+USE_JITKERNEL_MORE(kGRUHtPart2, mix)
diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc
new file mode 100644
index 0000000000000000000000000000000000000000..df0a85256b1f546d5f64be73925cf58b87a25bd7
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
@@ -0,0 +1,216 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/more/mix/mix.h"
+#include "paddle/fluid/operators/jit/kernels.h"
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace more {
+namespace mix {
+
+void VSigmoid(const T* x, T* y, int n) {
+  const float min = SIGMOID_THRESHOLD_MIN;
+  const float max = SIGMOID_THRESHOLD_MAX;
+  for (int i = 0; i < n; ++i) {
+    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
+    y[i] = static_cast<T>(0) - y[i];
+  }
+  auto compute = Get<KernelType::kVExp, XYNTuples<T>, platform::CPUPlace>(n);
+  compute(y, y, n);
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
+  }
+}
+
+void VTanh(const T* x, T* y, int n) {
+  const T a = 2, b = -1;
+  auto compute_scal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n);
+  auto compute_addbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n);
+  auto compute_sigmoid = Get<kVSigmoid, XYNTuples<T>, platform::CPUPlace>(n);
+  compute_scal(&a, x, y, n);
+  compute_sigmoid(y, y, n);
+  compute_scal(&a, y, y, n);
+  compute_addbias(&b, y, y, n);
+}
+
+void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
+  if (type == kVSigmoid) {
+    return Get<kVSigmoid, XYNTuples<T>, platform::CPUPlace>(d);
+  } else if (type == kVRelu) {
+    return Get<kVRelu, XYNTuples<T>, platform::CPUPlace>(d);
+  } else if (type == kVTanh) {
+    return Get<kVTanh, XYNTuples<T>, platform::CPUPlace>(d);
+  } else if (type == kVIdentity) {
+    return Get<kVIdentity, XYNTuples<T>, platform::CPUPlace>(d);
+  }
+  PADDLE_THROW("Not support type: %s", type);
+  return nullptr;
+}
+
+void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  const T* ct_1 = reinterpret_cast<const T*>(step->ct_1);
+  T* ct = reinterpret_cast<T*>(step->ct);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  const T* wp = reinterpret_cast<const T*>(step->wp);
+  T* checked = reinterpret_cast<T*>(step->checked);
+  const int d = attr->d;
+  const int d2 = d * 2;
+  const int d3 = d * 3;
+  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(d);
+  auto vadd_d = Get<kVAdd, XYZNTuples<T>, platform::CPUPlace>(d);
+  auto vadd_d2 = Get<kVAdd, XYZNTuples<T>, platform::CPUPlace>(d2);
+  auto act_gate_d = getActFunc(attr->act_gate, d);
+  auto act_gate_d2 = getActFunc(attr->act_gate, d2);
+  auto act_gate_d3 = getActFunc(attr->act_gate, d3);
+  auto act_cand_d = getActFunc(attr->act_cand, d);
+  auto act_cell_d = getActFunc(attr->act_cell, d);
+
+  if (attr->use_peephole) {
+    vmul_d(wp, ct_1, checked, d);
+    vmul_d(wp + d, ct_1, checked + d, d);
+    vadd_d2(checked, gates + d, gates + d, d2);
+    act_gate_d2(gates + d, gates + d, d2);
+  } else {
+    act_gate_d3(gates + d, gates + d, d3);
+  }
+
+  // C_t = C_t-1 * fgated + cand_gated * igated
+  act_cand_d(gates, gates, d);
+  vmul_d(gates, gates + d, gates + d, d);
+  vmul_d(ct_1, gates + d2, gates + d2, d);
+  vadd_d(gates + d, gates + d2, ct, d);
+
+  if (attr->use_peephole) {
+    // get ogated
+    vmul_d(wp + d2, ct, gates + d, d);
+    vadd_d(gates + d, gates + d3, gates + d3, d);
+    act_gate_d(gates + d3, gates + d3, d);
+  }
+  // H_t = act_cell(C_t) * ogated
+  act_cell_d(ct, gates + d2, d);
+  vmul_d(gates + d2, gates + d3, ht, d);
+}
+
+void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  T* ct = reinterpret_cast<T*>(step->ct);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  int d = attr->d;
+  int d2 = d * 2;
+  int d3 = d * 3;
+  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(d);
+  auto vadd_d = Get<kVAdd, XYZNTuples<T>, platform::CPUPlace>(d);
+  auto act_gate_d = getActFunc(attr->act_gate, d);
+  auto act_cand_d = getActFunc(attr->act_cand, d);
+  auto act_cell_d = getActFunc(attr->act_cell, d);
+  /* C_t = igated * cgated*/
+  act_gate_d(gates + d, gates + d, d);
+  act_cand_d(gates, gates, d);
+  vmul_d(gates, gates + d, ct, d);
+  if (attr->use_peephole) {
+    // get outgated, put W_oc * C_t on igated
+    const T* wp = reinterpret_cast<const T*>(step->wp);
+    vmul_d(wp + d2, ct, gates + d, d);
+    vadd_d(gates + d, gates + d3, gates + d3, d);
+  }
+  /* H_t = act_cell(C_t) * ogated */
+  act_gate_d(gates + d3, gates + d3, d);
+  act_cell_d(ct, gates + d2, d);
+  vmul_d(gates + d2, gates + d3, ht, d);
+}
+
+// compute h1 without h0
+void GRUH1(gru_t* step, const gru_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  int d = attr->d;
+  int d2 = d * 2;
+  auto act_gate = getActFunc(attr->act_gate, d);
+  auto act_cand = getActFunc(attr->act_cand, d);
+  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(d);
+  act_gate(gates, gates, d);
+  act_cand(gates + d2, gates + d2, d);
+  vmul_d(gates, gates + d2, ht, d);
+}
+
+// compute the first part of GRU: ht = act_gate(r) * ht_1
+void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
+  // W: {W_update, W_reset; W_state}
+  T* gates = reinterpret_cast<T*>(step->gates);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
+  auto act_gate = getActFunc(attr->act_gate, attr->d);
+  auto vmul_d = Get<kVMul, XYZNTuples<T>, platform::CPUPlace>(attr->d);
+  act_gate(gates + attr->d, gates + attr->d, attr->d);
+  vmul_d(ht_1, gates + attr->d, ht, attr->d);
+}
+
+// compute the second part of GRU:
+// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
+void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
+  T* gates = reinterpret_cast<T*>(step->gates);
+  T* ht = reinterpret_cast<T*>(step->ht);
+  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
+  int d = attr->d;
+  auto act_gate = getActFunc(attr->act_gate, d);
+  auto act_cand = getActFunc(attr->act_cand, d);
+  T* y = gates + d * 2;
+  act_gate(gates, gates, d);
+  act_cand(y, y, d);
+  // out = zt*ht~ + (1-zt)*ht_1
+  for (int i = 0; i < d; ++i) {
+    ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
+  }
+}
+
+// TODO(TJ): tuning me
+bool VSigmoidKernel::UseMe(const int& d) const { return true; }
+
+bool VTanhKernel::UseMe(const int& d) const { return true; }
+
+bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; }
+
+bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; }
+
+bool GRUH1Kernel::UseMe(const gru_attr_t& attr) const { return true; }
+
+bool GRUHtPart1Kernel::UseMe(const gru_attr_t& attr) const { return true; }
+
+bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; }
+
+}  // namespace mix
+}  // namespace more
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace mix = paddle::operators::jit::more::mix;
+
+#define REGISTER_MORE_KERNEL(key, func) \
+  REGISTER_JITKERNEL_MORE(key, mix, mix::func##Kernel)
+
+REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid);
+REGISTER_MORE_KERNEL(kVTanh, VTanh);
+REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt);
+REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1);
+REGISTER_MORE_KERNEL(kGRUH1, GRUH1);
+REGISTER_MORE_KERNEL(kGRUHtPart1, GRUHtPart1);
+REGISTER_MORE_KERNEL(kGRUHtPart2, GRUHtPart2);
+
+#undef REGISTER_MORE_KERNEL
diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h
new file mode 100644
index 0000000000000000000000000000000000000000..a70ecdf9348f511311307b4c27bb4506222a7439
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/mix/mix.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <type_traits>
+#include "paddle/fluid/operators/jit/kernel_base.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace more {
+namespace mix {
+using T = float;
+
+void VSigmoid(const T* x, T* y, int n);
+void VTanh(const T* x, T* y, int n);
+
+void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr);
+void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr);
+void GRUH1(gru_t* step, const gru_attr_t* attr);
+void GRUHtPart1(gru_t* step, const gru_attr_t* attr);
+void GRUHtPart2(gru_t* step, const gru_attr_t* attr);
+
+#define DECLARE_MORE_KERNEL(name, tuples)                            \
+  class name##Kernel : public KernelMore<tuples<T>> {                \
+   public:                                                           \
+    name##Kernel() { this->func = name; }                            \
+    bool UseMe(const typename tuples<T>::attr_type&) const override; \
+    const char* ImplType() const override { return "Mixed"; }        \
+  }
+
+// XYN
+DECLARE_MORE_KERNEL(VSigmoid, XYNTuples);
+DECLARE_MORE_KERNEL(VTanh, XYNTuples);
+
+DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples);
+DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples);
+
+DECLARE_MORE_KERNEL(GRUH1, GRUTuples);
+DECLARE_MORE_KERNEL(GRUHtPart1, GRUTuples);
+DECLARE_MORE_KERNEL(GRUHtPart2, GRUTuples);
+
+#undef DECLARE_MORE_KERNEL
+
+}  // namespace mix
+}  // namespace more
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..863cc720d68ce3dcfe045aa11c559a06a50909f3
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
@@ -0,0 +1,11 @@
+
+cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml)
+set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE)
+
+# use mkl kernels by name and type
+USE_JITKERNEL_MORE(kVMul, mkl)
+USE_JITKERNEL_MORE(kVAdd, mkl)
+USE_JITKERNEL_MORE(kVScal, mkl)
+USE_JITKERNEL_MORE(kVExp, mkl)
+USE_JITKERNEL_MORE(kVSigmoid, mkl)
+USE_JITKERNEL_MORE(kVTanh, mkl)
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5b088d4812b8a54e3b4fb1cb83d9e8bc7501994
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -0,0 +1,139 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/more/mkl/mkl.h"
+#include "paddle/fluid/operators/jit/refer/refer.h"
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/dynload/mklml.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace more {
+namespace mkl {
+
+template <>
+void VMul<float>(const float* x, const float* y, float* z, int n) {
+  platform::dynload::vsMul(n, x, y, z);
+}
+
+template <>
+void VMul<double>(const double* x, const double* y, double* z, int n) {
+  platform::dynload::vdMul(n, x, y, z);
+}
+
+template <>
+void VAdd<float>(const float* x, const float* y, float* z, int n) {
+  platform::dynload::vsAdd(n, x, y, z);
+}
+
+template <>
+void VAdd<double>(const double* x, const double* y, double* z, int n) {
+  platform::dynload::vdAdd(n, x, y, z);
+}
+
+template <>
+void VScal<float>(const float* a, const float* x, float* y, int n) {
+  if (x == y) {
+    platform::dynload::cblas_sscal(n, *a, y, 1);
+  } else {
+    refer::VScal<float>(a, x, y, n);
+  }
+}
+
+template <>
+void VScal<double>(const double* a, const double* x, double* y, int n) {
+  if (x == y) {
+    platform::dynload::cblas_dscal(n, *a, y, 1);
+  } else {
+    refer::VScal<double>(a, x, y, n);
+  }
+}
+
+template <>
+void VExp<float>(const float* x, float* y, int n) {
+  platform::dynload::vsExp(n, x, y);
+}
+
+template <>
+void VExp<double>(const double* x, double* y, int n) {
+  platform::dynload::vdExp(n, x, y);
+}
+
+// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
+template <>
+bool VMulKernel<float>::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx512f) && d > 512;
+}
+
+template <>
+bool VAddKernel<float>::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx512f) && d > 512;
+}
+
+template <>
+bool VScalKernel<float>::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx512f) && d > 512;
+}
+
+template <>
+bool VExpKernel<float>::UseMe(const int& d) const {
+  return d > 7;
+}
+
+template <>
+bool VSigmoidKernel<float>::UseMe(const int& d) const {
+  return d > 7;
+}
+
+template <>
+bool VTanhKernel<float>::UseMe(const int& d) const {
+  return d > 7;
+}
+
+#define AWALYS_USE_ME_WITH_DOUBLE(func)                  \
+  template <>                                            \
+  bool func##Kernel<double>::UseMe(const int& d) const { \
+    return true;                                         \
+  }
+
+AWALYS_USE_ME_WITH_DOUBLE(VMul);
+AWALYS_USE_ME_WITH_DOUBLE(VAdd);
+AWALYS_USE_ME_WITH_DOUBLE(VScal);
+AWALYS_USE_ME_WITH_DOUBLE(VExp);
+AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
+AWALYS_USE_ME_WITH_DOUBLE(VTanh);
+
+#undef AWALYS_USE_ME_WITH_DOUBLE
+}  // namespace mkl
+}  // namespace more
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace mkl = paddle::operators::jit::more::mkl;
+
+#define REGISTER_MKL_KERNEL(key, func)                        \
+  REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel<float>, \
+                          mkl::func##Kernel<double>)
+
+REGISTER_MKL_KERNEL(kVMul, VMul);
+REGISTER_MKL_KERNEL(kVAdd, VAdd);
+REGISTER_MKL_KERNEL(kVScal, VScal);
+REGISTER_MKL_KERNEL(kVExp, VExp);
+REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
+REGISTER_MKL_KERNEL(kVTanh, VTanh);
+
+#undef REGISTER_MKL_KERNEL
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee1031c028ff72181f504004b7cbeb9f7ee578f1
--- /dev/null
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -0,0 +1,90 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <type_traits>
+#include "paddle/fluid/operators/jit/kernel_base.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace more {
+namespace mkl {
+
+template <typename T>
+void VMul(const T* x, const T* y, T* z, int n);
+
+template <typename T>
+void VAdd(const T* x, const T* y, T* z, int n);
+
+template <typename T>
+void VScal(const T* a, const T* x, T* y, int n);
+
+template <typename T>
+void VExp(const T* x, T* y, int n);
+
+template <typename T>
+void VSigmoid(const T* x, T* y, int n) {
+  const T min = SIGMOID_THRESHOLD_MIN;
+  const T max = SIGMOID_THRESHOLD_MAX;
+  for (int i = 0; i < n; ++i) {
+    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
+    y[i] = static_cast<T>(0) - y[i];
+  }
+  VExp(y, y, n);
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
+  }
+}
+
+template <typename T>
+void VTanh(const T* x, T* y, int n) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(2) * x[i];
+  }
+  VSigmoid(y, y, n);
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
+  }
+}
+
+#define DECLARE_MKL_KERNEL(name, tuples)                             \
+  template <typename T>                                              \
+  class name##Kernel : public KernelMore<tuples<T>> {                \
+   public:                                                           \
+    name##Kernel() { this->func = name<T>; }                         \
+    bool UseMe(const typename tuples<T>::attr_type&) const override; \
+    const char* ImplType() const override { return "MKL"; }          \
+  }
+
+// XYZN
+DECLARE_MKL_KERNEL(VMul, XYZNTuples);
+DECLARE_MKL_KERNEL(VAdd, XYZNTuples);
+
+// AXYN
+DECLARE_MKL_KERNEL(VScal, AXYNTuples);
+
+// XYN
+DECLARE_MKL_KERNEL(VExp, XYNTuples);
+DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
+DECLARE_MKL_KERNEL(VTanh, XYNTuples);
+
+#undef DECLARE_MKL_KERNEL
+
+}  // namespace mkl
+}  // namespace more
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..07497b732050a7299e224531db37eb56e60ef605
--- /dev/null
+++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt
@@ -0,0 +1,28 @@
+
+cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base)
+set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE)
+
+function(USE_JITKERNEL_REFER TARGET)
+    file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n")
+endfunction()
+
+# use refer kernel by name
+USE_JITKERNEL_REFER(kVMul)
+USE_JITKERNEL_REFER(kVAdd)
+USE_JITKERNEL_REFER(kVAddRelu)
+USE_JITKERNEL_REFER(kVSub)
+USE_JITKERNEL_REFER(kVScal)
+USE_JITKERNEL_REFER(kVAddBias)
+USE_JITKERNEL_REFER(kVRelu)
+USE_JITKERNEL_REFER(kVIdentity)
+USE_JITKERNEL_REFER(kVExp)
+USE_JITKERNEL_REFER(kVSigmoid)
+USE_JITKERNEL_REFER(kVTanh)
+USE_JITKERNEL_REFER(kLSTMCtHt)
+USE_JITKERNEL_REFER(kLSTMC1H1)
+USE_JITKERNEL_REFER(kGRUH1)
+USE_JITKERNEL_REFER(kGRUHtPart1)
+USE_JITKERNEL_REFER(kGRUHtPart2)
+USE_JITKERNEL_REFER(kCRFDecoding)
+USE_JITKERNEL_REFER(kLayerNorm)
+USE_JITKERNEL_REFER(kNCHW16CMulNC)
diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d196266326b4ee668f647fa51032f6344d26e5c6
--- /dev/null
+++ b/paddle/fluid/operators/jit/refer/refer.cc
@@ -0,0 +1,50 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/refer/refer.h"
+#include "paddle/fluid/operators/jit/registry.h"
+
+namespace refer = paddle::operators::jit::refer;
+
+#define REGISTER_REFER_KERNEL(key, func)                    \
+  REGISTER_JITKERNEL_REFER(key, refer::func##Kernel<float>, \
+                           refer::func##Kernel<double>)
+
+REGISTER_REFER_KERNEL(kVMul, VMul);
+REGISTER_REFER_KERNEL(kVAdd, VAdd);
+REGISTER_REFER_KERNEL(kVAddRelu, VAddRelu);
+REGISTER_REFER_KERNEL(kVSub, VSub);
+
+REGISTER_REFER_KERNEL(kVScal, VScal);
+REGISTER_REFER_KERNEL(kVAddBias, VAddBias);
+
+REGISTER_REFER_KERNEL(kVRelu, VRelu);
+REGISTER_REFER_KERNEL(kVIdentity, VIdentity);
+REGISTER_REFER_KERNEL(kVExp, VExp);
+REGISTER_REFER_KERNEL(kVSigmoid, VSigmoid);
+REGISTER_REFER_KERNEL(kVTanh, VTanh);
+
+REGISTER_REFER_KERNEL(kLSTMCtHt, LSTMCtHt);
+REGISTER_REFER_KERNEL(kLSTMC1H1, LSTMC1H1);
+
+REGISTER_REFER_KERNEL(kGRUH1, GRUH1);
+REGISTER_REFER_KERNEL(kGRUHtPart1, GRUHtPart1);
+REGISTER_REFER_KERNEL(kGRUHtPart2, GRUHtPart2);
+
+REGISTER_REFER_KERNEL(kCRFDecoding, CRFDecoding);
+REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm);
+
+REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC);
+
+#undef REGISTER_REFER_KERNEL
diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/jit/refer/refer.h
similarity index 54%
rename from paddle/fluid/operators/math/jit_kernel_refer.h
rename to paddle/fluid/operators/jit/refer/refer.h
index e0b2e3c7fada6b422318c68a42fd6d103c99af5a..0fd1b89dfdba9f4655f649fa6d32604188c78da3 100644
--- a/paddle/fluid/operators/math/jit_kernel_refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -1,30 +1,31 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
 
 #pragma once
+
 #include <cmath>
-#include <string>
-#include "paddle/fluid/operators/math/jit_kernel_impl.h"
+#include <limits>
+#include "paddle/fluid/operators/jit/helper.h"
+#include "paddle/fluid/operators/jit/kernel_base.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace operators {
-namespace math {
-namespace jitkernel {
+namespace jit {
 namespace refer {
-/* Refer code only focus on correctness */
 
+// Refer code only focus on correctness
 template <typename T>
 void VMul(const T* x, const T* y, T* z, int n) {
   for (int i = 0; i < n; ++i) {
@@ -47,6 +48,13 @@ void VAddRelu(const T* x, const T* y, T* z, int n) {
   }
 }
 
+template <typename T>
+void VSub(const T* x, const T* y, T* z, int n) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] - y[i];
+  }
+}
+
 template <typename T>
 void VScal(const T* a, const T* x, T* y, int n) {
   for (int i = 0; i < n; ++i) {
@@ -69,7 +77,11 @@ void VRelu(const T* x, T* y, int n) {
 }
 
 template <typename T>
-inline void VIdentity(const T* x, T* y, int n) {}
+inline void VIdentity(const T* x, T* y, int n) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i];
+  }
+}
 
 template <typename T>
 void VExp(const T* x, T* y, int n) {
@@ -102,20 +114,22 @@ void VTanh(const T* x, T* y, int n) {
 }
 
 template <typename T>
-void (*getActFunc(const std::string& type))(const T*, T*, int) {  // NOLINT
-  if (type == "sigmoid") {
+void (*getActFunc(KernelType type))(const T*, T*, int) {  // NOLINT
+  if (type == kVSigmoid) {
     return VSigmoid<T>;
-  } else if (type == "relu") {
+  } else if (type == kVRelu) {
     return VRelu<T>;
-  } else if (type == "tanh") {
+  } else if (type == kVTanh) {
     return VTanh<T>;
-  } else if (type == "identity" || type == "") {
+  } else if (type == kVIdentity) {
     return VIdentity<T>;
   }
   PADDLE_THROW("Not support type: %s", type);
   return nullptr;
 }
 
+// TODO(TJ): add refer gemm and make LSTM kernels combine as same GRU kernels
+
 // compute ct and ht
 template <typename T>
 void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
@@ -231,8 +245,134 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
   }
 }
 
+template <typename T>
+void CRFDecoding(const int seq_len, const T* x, const T* w, T* alpha,
+                 int* track, int right) {
+  constexpr int state_trans_base_idx = 2;
+  for (int i = 0; i < right; ++i) {
+    alpha[i] = w[i] + x[i];
+  }
+  for (int k = 1; k < seq_len; ++k) {
+    for (int i = 0; i < right; ++i) {
+      T max_score = -std::numeric_limits<T>::max();
+      int max_j = 0;
+      for (int j = 0; j < right; ++j) {
+        T score = alpha[(k - 1) * right + j] +
+                  w[(j + state_trans_base_idx) * right + i];
+        if (score > max_score) {
+          max_score = score;
+          max_j = j;
+        }
+      }
+      alpha[k * right + i] = max_score + x[k * right + i];
+      track[k * right + i] = max_j;
+    }
+  }
+}
+
+template <typename T>
+void LayerNorm(T* x, T* out, T* mean, T* var, const T* scale, const T* bias,
+               int height, const float epsilon, int right) {
+  // get mean
+  for (int i = 0; i < height; i++) {
+    T sum = 0.0;
+    int offset = i * right;
+    for (int j = 0; j < right; j++) {
+      sum += x[offset + j];
+    }
+    mean[i] = sum / right;
+  }
+
+  // get variance
+  for (int i = 0; i < height; i++) {
+    T sum = 0.0;
+    int offset = i * right;
+    for (int j = 0; j < right; j++) {
+      sum += (x[offset + j] - mean[i]) * (x[offset + j] - mean[i]);
+    }
+    var[i] = sum / right;
+  }
+
+  for (int i = 0; i < height; i++) {
+    int offset = i * right;
+    T sqrt_var = std::sqrt(var[i] + (T)epsilon);
+    for (int j = 0; j < right; j++) {
+      out[offset + j] = (x[offset + j] - mean[i]) / sqrt_var;
+    }
+  }
+  if (scale) {
+    for (int i = 0; i < height; i++) {
+      int offset = i * right;
+      for (int j = 0; j < right; j++) {
+        out[offset + j] *= scale[j];
+      }
+    }
+  }
+
+  if (bias) {
+    for (int i = 0; i < height; i++) {
+      int offset = i * right;
+      for (int j = 0; j < right; j++) {
+        out[offset + j] += bias[j];
+      }
+    }
+  }
+}
+
+template <typename T>
+void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) {
+  int offset = 0;
+  for (int h = 0; h < height; ++h) {
+    for (int w = 0; w < width; ++w) {
+      for (int i = 0; i < 16; ++i) {
+        z[i + offset] = y[i] * x[i + offset];
+      }
+      offset += ZMM_FLOAT_BLOCK;
+    }
+  }
+}
+
+#define DECLARE_REFER_KERNEL(name, tuples)             \
+  template <typename T>                                \
+  class name##Kernel : public ReferKernel<tuples<T>> { \
+   public:                                             \
+    name##Kernel() { this->func = name<T>; }           \
+  }
+
+// const T* x, const T* y, T* z, int n
+DECLARE_REFER_KERNEL(VMul, XYZNTuples);
+DECLARE_REFER_KERNEL(VAdd, XYZNTuples);
+DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples);
+DECLARE_REFER_KERNEL(VSub, XYZNTuples);
+
+// const T* a, const T* x, T* y, int n
+DECLARE_REFER_KERNEL(VScal, AXYNTuples);
+DECLARE_REFER_KERNEL(VAddBias, AXYNTuples);
+
+// const T* x, T* y, int n
+DECLARE_REFER_KERNEL(VRelu, XYNTuples);
+DECLARE_REFER_KERNEL(VIdentity, XYNTuples);
+DECLARE_REFER_KERNEL(VExp, XYNTuples);
+DECLARE_REFER_KERNEL(VSigmoid, XYNTuples);
+DECLARE_REFER_KERNEL(VTanh, XYNTuples);
+
+// lstm_t*, const lstm_attr_t*
+DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples);
+DECLARE_REFER_KERNEL(LSTMC1H1, LSTMTuples);
+
+// gru_t*, const gru_attr_t*
+DECLARE_REFER_KERNEL(GRUH1, GRUTuples);
+DECLARE_REFER_KERNEL(GRUHtPart1, GRUTuples);
+DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples);
+
+DECLARE_REFER_KERNEL(CRFDecoding, CRFDecodingTuples);
+DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples);
+
+DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples);
+
+#undef DECLARE_REFER_KERNEL
+
 }  // namespace refer
-}  // namespace jitkernel
-}  // namespace math
+}  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb32c487208fe8fe9e72c069db8833c736316aec
--- /dev/null
+++ b/paddle/fluid/operators/jit/registry.h
@@ -0,0 +1,167 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <tuple>
+#include <type_traits>
+#include "paddle/fluid/operators/jit/kernel_base.h"
+#include "paddle/fluid/operators/jit/kernel_pool.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/variant.h"  // for UNUSED
+
+namespace paddle {
+namespace operators {
+namespace jit {
+
+// make_unique is supported since c++14
+template <typename T, typename... Args>
+inline std::unique_ptr<T> make_unique(Args&&... args) {
+  static_assert(!std::is_array<T>::value, "T must not be array");
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+template <typename Pool, typename PlaceType, bool IsEnd, size_t I,
+          typename... KernelImpls>
+struct JitKernelRegistrarFunctor;
+
+template <typename Pool, typename PlaceType, size_t I, typename... KernelImpls>
+struct JitKernelRegistrarFunctor<Pool, PlaceType, true, I, KernelImpls...> {
+  void operator()(KernelType kt) const {}
+};
+
+template <typename Pool, typename PlaceType, size_t I, typename... KernelImpls>
+struct JitKernelRegistrarFunctor<Pool, PlaceType, false, I, KernelImpls...> {
+  using KERNEL_IMPL_TYPE =
+      typename std::tuple_element<I, std::tuple<KernelImpls...>>::type;
+
+  void operator()(KernelType kt) const {
+    KernelKey kkey(kt, PlaceType());
+    Pool().Instance().Insert(kkey,
+                             std::move(make_unique<const KERNEL_IMPL_TYPE>()));
+    constexpr auto size = std::tuple_size<std::tuple<KernelImpls...>>::value;
+    JitKernelRegistrarFunctor<Pool, PlaceType, I + 1 == size, I + 1,
+                              KernelImpls...>
+        func;
+    func(kt);
+  }
+};
+
+template <typename Pool, typename PlaceType, typename... KernelImpls>
+class JitKernelRegistrar {
+ public:
+  explicit JitKernelRegistrar(KernelType kt) {
+    JitKernelRegistrarFunctor<Pool, PlaceType, false, 0, KernelImpls...> func;
+    func(kt);
+  }
+  void Touch() {}
+};
+
+#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg)              \
+  struct __test_global_namespace_##uniq_name##__ {};                          \
+  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
+                             __test_global_namespace_##uniq_name##__>::value, \
+                msg)
+
+// Refer always on CPUPlace
+#define REGISTER_JITKERNEL_REFER(kernel_type, ...)                             \
+  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                                    \
+      __reg_jitkernel_##kernel_type##_refer_CPUPlace,                          \
+      "REGISTER_KERNEL_REFER must be called in global namespace");             \
+  static ::paddle::operators::jit::JitKernelRegistrar<                         \
+      ::paddle::operators::jit::ReferKernelPool, ::paddle::platform::CPUPlace, \
+      __VA_ARGS__>                                                             \
+      __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_(                  \
+          ::paddle::operators::jit::KernelType::kernel_type);                  \
+  int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() {                    \
+    __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch();            \
+    return 0;                                                                  \
+  }
+
+// kernel_type: should be in paddle::operators::jit::KernelType
+// place_type: should be one of CPUPlace and GPUPlace in paddle::platform
+#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...)         \
+  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                                   \
+      __reg_jitkernel_##kernel_type##_##impl_type##_##place_type,             \
+      "REGISTER_KERNEL_MORE must be called in global namespace");             \
+  extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_();             \
+  static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \
+      UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_();           \
+  static ::paddle::operators::jit::JitKernelRegistrar<                        \
+      ::paddle::operators::jit::KernelPool, ::paddle::platform::place_type,   \
+      __VA_ARGS__>                                                            \
+      __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_(   \
+          ::paddle::operators::jit::KernelType::kernel_type);                 \
+  int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() {     \
+    __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_      \
+        .Touch();                                                             \
+    return 0;                                                                 \
+  }
+
+#define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \
+  REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__)
+
+#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \
+  REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__)
+
+#define REGISTER_JITKERNEL_GEN(kernel_type, ...)                    \
+  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                         \
+      __reg_jitkernel_gen_##kernel_type##_CPUPlace_,                \
+      "REGISTER_JITKERNEL_GEN must be called in global namespace"); \
+  extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_();   \
+  static int __assert_gen_##kernel_type##_has_refer_ UNUSED =       \
+      TouchJitKernelReg_##kernel_type##_refer_CPUPlace_();          \
+  static ::paddle::operators::jit::JitKernelRegistrar<              \
+      ::paddle::operators::jit::JitCodeCreatorPool,                 \
+      ::paddle::platform::CPUPlace, __VA_ARGS__>                    \
+      __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_(         \
+          ::paddle::operators::jit::KernelType::kernel_type);       \
+  int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() {           \
+    __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch();   \
+    return 0;                                                       \
+  }
+
+#define USE_JITKERNEL_GEN(kernel_type)                            \
+  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                       \
+      __reg_jitkernel_gen_##kernel_type##_CPUPlace_,              \
+      "USE_JITKERNEL_GEN must be called in global namespace");    \
+  extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_();   \
+  static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \
+      TouchJitKernelReg_gen_##kernel_type##_CPUPlace_()
+
+#define USE_JITKERNEL_REFER(kernel_type)                            \
+  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                         \
+      __reg_jitkernel_##kernel_type##_refer_CPUPlace_,              \
+      "USE_JITKERNEL_REFER must be called in global namespace");    \
+  extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_();   \
+  static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \
+      TouchJitKernelReg_##kernel_type##_refer_CPUPlace_()
+
+#define USE_KERNEL_MORE(kernel_type, impl_type, place_type)              \
+  STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(                              \
+      __reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_,     \
+      "USE_JITKERNEL_MORE must be called in global namespace");          \
+  extern int                                                             \
+      TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \
+  static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \
+      UNUSED =                                                           \
+          TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_()
+
+#define USE_JITKERNEL_MORE(kernel_type, impl_type) \
+  USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace)
+
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a73e2a60aeb0c1594b5072b2bffbd11cccfcdc7d
--- /dev/null
+++ b/paddle/fluid/operators/jit/test.cc
@@ -0,0 +1,584 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include <random>
+#include <string>
+#include <vector>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/operators/jit/kernels.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/place.h"
+
+template <typename T>
+void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
+               const T upper = static_cast<T>(20.f)) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+  for (int i = 0; i < n; ++i) {
+    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+  }
+}
+
+template <typename T>
+void ExpectEQ(const T* target, const T* refer, int n) {
+  if (std::is_floating_point<T>::value) {
+    for (int i = 0; i < n; ++i) {
+      EXPECT_NEAR(target[i], refer[i], 1e-5);
+    }
+  } else {
+    for (int i = 0; i < n; ++i) {
+      EXPECT_EQ(target[i], refer[i]);
+    }
+  }
+}
+
+std::vector<int> TestSizes() {
+  std::vector<int> s;
+  for (int i = 1; i < 32; ++i) {
+    s.push_back(i);
+  }
+  // test some large size
+  s.push_back(100);
+  s.push_back(1000);
+  s.push_back(2000);
+  return s;
+}
+
+namespace jit = paddle::operators::jit;
+
+template <typename KernelTuples, typename... Args>
+struct TestFuncWithRefer {
+  void operator()(const typename KernelTuples::func_type tgt, Args... args) {}
+};
+
+template <typename T>
+struct TestFuncWithRefer<jit::XYZNTuples<T>, std::vector<T>, std::vector<T>,
+                         std::vector<T>> {
+  void operator()(const typename jit::XYZNTuples<T>::func_type tgt,
+                  const std::vector<T>& x, const std::vector<T>& y,
+                  const std::vector<T>& zref) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(zref.size(), x.size());
+    EXPECT_EQ(zref.size(), y.size());
+    const T* x_data = x.data();
+    const T* y_data = y.data();
+    const T* zref_data = zref.data();
+    const int d = zref.size();
+
+    std::vector<T> ztgt(d);
+    T* ztgt_data = ztgt.data();
+    // test normal
+    tgt(x_data, y_data, ztgt_data, d);
+    ExpectEQ<T>(ztgt_data, zref_data, d);
+    // test inplace x
+    std::copy(x.begin(), x.end(), ztgt.begin());
+    tgt(ztgt_data, y_data, ztgt_data, d);
+    ExpectEQ<T>(ztgt_data, zref_data, d);
+    // test inplace y
+    std::copy(y.begin(), y.end(), ztgt.begin());
+    tgt(x_data, ztgt_data, ztgt_data, d);
+    ExpectEQ<T>(ztgt_data, zref_data, d);
+  }
+};
+
+template <typename T>
+struct TestFuncWithRefer<jit::AXYNTuples<T>, T, std::vector<T>,
+                         std::vector<T>> {
+  void operator()(const typename jit::AXYNTuples<T>::func_type tgt, const T a,
+                  const std::vector<T>& x, const std::vector<T>& yref) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(yref.size(), x.size());
+    const T* x_data = x.data();
+    const T* yref_data = yref.data();
+    const int d = yref.size();
+    std::vector<T> ytgt(d);
+    T* ytgt_data = ytgt.data();
+    // test normal
+    tgt(&a, x_data, ytgt_data, d);
+    ExpectEQ<T>(ytgt_data, yref_data, d);
+    // test inplace x
+    std::copy(x.begin(), x.end(), ytgt.begin());
+    tgt(&a, ytgt_data, ytgt_data, d);
+    ExpectEQ<T>(ytgt_data, yref_data, d);
+  }
+};
+
+template <typename T>
+struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
+  void operator()(const typename jit::XYNTuples<T>::func_type tgt,
+                  const std::vector<T>& x, const std::vector<T>& yref) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(yref.size(), x.size());
+    const T* x_data = x.data();
+    const T* yref_data = yref.data();
+    const int d = yref.size();
+    std::vector<T> ytgt(d);
+    T* ytgt_data = ytgt.data();
+    // test normal
+    tgt(x_data, ytgt_data, d);
+    ExpectEQ<T>(ytgt_data, yref_data, d);
+    // test inplace x
+    std::copy(x.begin(), x.end(), ytgt.begin());
+    tgt(ytgt_data, ytgt_data, d);
+    ExpectEQ<T>(ytgt_data, yref_data, d);
+  }
+};
+
+template <typename T>
+struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
+                         std::vector<T>, std::vector<T>, std::vector<T>> {
+  void operator()(const typename jit::LSTMTuples<T>::func_type tgt,
+                  const std::vector<T>& xsrc, const std::vector<T>& wp,
+                  const std::vector<T>& ct_1, const std::vector<T>& ct_ref,
+                  const std::vector<T>& ht_ref,
+                  const typename jit::LSTMTuples<T>::attr_type& attr) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(ct_ref.size(), ht_ref.size());
+    EXPECT_EQ(ct_1.size(), ht_ref.size());
+    EXPECT_EQ(xsrc.size(), 4 * ht_ref.size());
+    EXPECT_EQ(wp.size(), 3 * ht_ref.size());
+
+    // x could be changed after compute, so copy to save src
+    int d = ht_ref.size();
+    std::vector<T> x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size());
+    std::vector<T> checked(2 * d);
+    std::copy(xsrc.begin(), xsrc.end(), x.begin());
+
+    const T* ct_1_data = ct_1.data();
+    const T* wp_data = wp.data();
+    const T* ct_ref_data = ct_ref.data();
+    const T* ht_ref_data = ht_ref.data();
+    T* x_data = x.data();
+    T* ct_data = ct.data();
+    T* ht_data = ht.data();
+    T* checked_data = checked.data();
+
+    paddle::operators::jit::lstm_t step;
+    step.gates = x_data;
+    step.ct_1 = ct_1_data;
+    step.ct = ct_data;
+    step.ht = ht_data;
+    if (attr.use_peephole) {
+      step.wp = wp_data;
+      step.checked = checked_data;
+    }
+
+    tgt(&step, &attr);
+    ExpectEQ<T>(ct_data, ct_ref_data, d);
+    ExpectEQ<T>(ht_data, ht_ref_data, d);
+  }
+};
+
+template <typename T>
+struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
+                         std::vector<T>> {
+  void operator()(const typename jit::GRUTuples<T>::func_type tgt,
+                  const std::vector<T>& xsrc, const std::vector<T>& ht_1,
+                  const std::vector<T>& ht_ref,
+                  const typename jit::GRUTuples<T>::attr_type& attr) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(ht_1.size(), ht_ref.size());
+    EXPECT_EQ(xsrc.size(), 3 * ht_ref.size());
+
+    // x could be changed after compute, so copy to save src
+    int d = ht_ref.size();
+    std::vector<T> x(xsrc.size()), ht(ht_ref.size());
+    std::copy(xsrc.begin(), xsrc.end(), x.begin());
+    const T* ht_1_data = ht_1.data();
+    const T* ht_ref_data = ht_ref.data();
+    T* x_data = x.data();
+    T* ht_data = ht.data();
+    paddle::operators::jit::gru_t step;
+    step.gates = x_data;
+    step.ht_1 = ht_1_data;
+    step.ht = ht_data;
+    tgt(&step, &attr);
+    ExpectEQ<T>(ht_data, ht_ref_data, d);
+  }
+};
+
+template <paddle::operators::jit::KernelType KT, typename KernelTuples,
+          typename PlaceType, typename... Args>
+void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
+  TestFuncWithRefer<KernelTuples, Args...> test;
+  // test jitcode
+  auto jitcode = jit::GetJitCode<KT, KernelTuples, PlaceType>(attr);
+  if (jitcode) {
+    VLOG(10) << "Test Jitcode Kernel ";
+    test(jitcode, args...);
+  }
+  // test all impls in more
+  jit::KernelKey kkey(KT, PlaceType());
+  auto& pool = jit::KernelPool().Instance().AllKernels();
+  auto iter = pool.find(kkey);
+  if (iter != pool.end()) {
+    auto& impls = iter->second;
+    for (auto& impl : impls) {
+      auto i = dynamic_cast<const jit::KernelMore<KernelTuples>*>(impl.get());
+      if (i && i->UseMe(attr)) {
+        auto more = i->GetFunc();
+        VLOG(10) << "Test More Kernel : " << i->ImplType();
+        test(more, args...);
+      }
+    }
+  }
+  // test result from Get function
+  // VLOG(10) << "Test Get function ";
+  auto tgt = jit::Get<KT, KernelTuples, PlaceType>(attr);
+  test(tgt, args...);
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestXYZNKernel() {
+  namespace jit = paddle::operators::jit;
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  for (int d : TestSizes()) {
+    auto ref = jit::GetRefer<KT, jit::XYZNTuples<T>>();
+    EXPECT_TRUE(ref != nullptr);
+
+    std::vector<T> x(d), y(d), zref(d);
+    RandomVec<T>(d, x.data());
+    RandomVec<T>(d, y.data());
+
+    std::vector<T> xinp(d), yinp(d);  // inplace test
+    std::copy(x.begin(), x.end(), xinp.begin());
+    std::copy(y.begin(), y.end(), yinp.begin());
+
+    const T* x_data = x.data();
+    const T* y_data = y.data();
+    T* zref_data = zref.data();
+    T* xinp_data = xinp.data();
+    T* yinp_data = yinp.data();
+
+    // test refer code inplace
+    ref(x_data, y_data, zref_data, d);
+    ref(x_data, yinp_data, yinp_data, d);
+    ref(xinp_data, y_data, xinp_data, d);
+    ExpectEQ<T>(xinp_data, zref_data, d);
+    ExpectEQ<T>(yinp_data, zref_data, d);
+
+    TestAllImpls<KT, jit::XYZNTuples<T>, PlaceType, std::vector<T>,
+                 std::vector<T>, std::vector<T>>(d, x, y, zref);
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestAXYNKernel() {
+  namespace jit = paddle::operators::jit;
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  for (int d : TestSizes()) {
+    auto ref = jit::GetRefer<KT, jit::AXYNTuples<T>>();
+    EXPECT_TRUE(ref != nullptr);
+
+    const T a = static_cast<T>(3);
+    std::vector<T> x(d), yref(d);
+    std::vector<T> xinp(d);  // inplace test
+    RandomVec<T>(d, x.data());
+    std::copy(x.begin(), x.end(), xinp.begin());
+
+    const T* x_data = x.data();
+    T* yref_data = yref.data();
+    T* xinp_data = xinp.data();
+    // test refer code inplace
+    ref(&a, x_data, yref_data, d);
+    ref(&a, xinp_data, xinp_data, d);
+    ExpectEQ<T>(xinp_data, yref_data, d);
+
+    TestAllImpls<KT, jit::AXYNTuples<T>, PlaceType, T, std::vector<T>,
+                 std::vector<T>>(d, a, x, yref);
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestXYNKernel() {
+  namespace jit = paddle::operators::jit;
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  for (int d : TestSizes()) {
+    auto ref = jit::GetRefer<KT, jit::XYNTuples<T>>();
+    EXPECT_TRUE(ref != nullptr);
+
+    std::vector<T> x(d), yref(d);
+    std::vector<T> xinp(d);  // inplace test
+    RandomVec<T>(d, x.data(), -2.f, 2.f);
+    std::copy(x.begin(), x.end(), xinp.begin());
+
+    const T* x_data = x.data();
+    T* yref_data = yref.data();
+    T* xinp_data = xinp.data();
+    // test refer code inplace
+    ref(x_data, yref_data, d);
+    ref(xinp_data, xinp_data, d);
+    ExpectEQ<T>(xinp_data, yref_data, d);
+
+    TestAllImpls<KT, jit::XYNTuples<T>, PlaceType, std::vector<T>,
+                 std::vector<T>>(d, x, yref);
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestLSTMKernel() {
+  namespace jit = paddle::operators::jit;
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
+  for (int d : TestSizes()) {
+    for (bool use_peephole : {true, false}) {
+      for (auto& act_gate : all_acts) {
+        for (auto& act_cand : all_acts) {
+          for (auto& act_cell : all_acts) {
+            const jit::lstm_attr_t attr(
+                d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand),
+                jit::to_kerneltype(act_cell), use_peephole);
+            auto ref = jit::GetRefer<KT, jit::LSTMTuples<T>>();
+            EXPECT_TRUE(ref != nullptr);
+            std::vector<T> xsrc(4 * d), wp(3 * d), ct_1(d);
+            std::vector<T> ct_ref(d), ht_ref(d), checked(2 * d);
+            RandomVec<T>(4 * d, xsrc.data(), -2.f, 2.f);
+            RandomVec<T>(3 * d, wp.data(), -2.f, 2.f);
+            RandomVec<T>(d, ct_1.data(), -2.f, 2.f);
+            // x could be changed after compute, so copy to save src
+            std::vector<T> x(xsrc.size());
+            std::copy(xsrc.begin(), xsrc.end(), x.begin());
+            const T* ct_1_data = ct_1.data();
+            const T* wp_data = wp.data();
+            T* x_data = x.data();
+            T* checked_data = checked.data();
+            T* ct_ref_data = ct_ref.data();
+            T* ht_ref_data = ht_ref.data();
+            jit::lstm_t step;
+            step.gates = x_data;
+            step.ct_1 = ct_1_data;
+            step.ct = ct_ref_data;
+            step.ht = ht_ref_data;
+            if (use_peephole) {
+              step.wp = wp_data;
+              step.checked = checked_data;
+            }
+            ref(&step, &attr);
+            VLOG(10) << attr;
+            TestAllImpls<KT, jit::LSTMTuples<T>, PlaceType, std::vector<T>,
+                         std::vector<T>, std::vector<T>, std::vector<T>,
+                         std::vector<T>>(attr, xsrc, wp, ct_1, ct_ref, ht_ref,
+                                         attr);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestGRUKernel() {
+  namespace jit = paddle::operators::jit;
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
+  for (int d : TestSizes()) {
+    for (auto& act_gate : all_acts) {
+      for (auto& act_cand : all_acts) {
+        const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate),
+                                   jit::to_kerneltype(act_cand));
+        auto ref = jit::GetRefer<KT, jit::GRUTuples<T>>();
+        EXPECT_TRUE(ref != nullptr);
+        std::vector<T> xsrc(3 * d), ht_1(d), ht_ref(d);
+        RandomVec<T>(3 * d, xsrc.data(), -2.f, 2.f);
+        RandomVec<T>(d, ht_1.data(), -2.f, 2.f);
+        // x could be changed after compute, so copy to save src
+        std::vector<T> x(xsrc.size());
+        std::copy(xsrc.begin(), xsrc.end(), x.begin());
+        const T* ht_1_data = ht_1.data();
+        T* x_data = x.data();
+        T* ht_ref_data = ht_ref.data();
+        jit::gru_t step;
+        step.gates = x_data;
+        step.ht_1 = ht_1_data;
+        step.ht = ht_ref_data;
+        ref(&step, &attr);
+        VLOG(10) << attr;
+        TestAllImpls<KT, jit::GRUTuples<T>, PlaceType, std::vector<T>,
+                     std::vector<T>, std::vector<T>>(attr, xsrc, ht_1, ht_ref,
+                                                     attr);
+      }
+    }
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestNCHW16CMulNCKernel() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  const int n = 3, c = 16 * 4, h = 10, w = 10;
+  auto ref = jit::GetRefer<KT, jit::NCHW16CMulNCTuples<T>>();
+  EXPECT_TRUE(ref != nullptr);
+  int sz = n * c * h * w;
+  std::vector<T> x(sz), y(n * c), zref(sz);
+  std::vector<T> ztgt(sz), zjit(sz);
+  RandomVec<T>(sz, x.data(), -2.f, 2.f);
+  RandomVec<T>(n * c, y.data(), -2.f, 2.f);
+
+  const T* x_data = x.data();
+  const T* y_data = y.data();
+  T* zref_data = zref.data();
+  T* ztgt_data = ztgt.data();
+  T* zjit_data = zjit.data();
+  constexpr int simd_width = ZMM_FLOAT_BLOCK;
+  int C = c / simd_width;
+  auto tgt = jit::Get<KT, jit::NCHW16CMulNCTuples<T>, PlaceType>(0);
+  auto jitcode = jit::GetJitCode<KT, jit::NCHW16CMulNCTuples<T>, PlaceType>(0);
+  EXPECT_TRUE(tgt != nullptr);
+
+  if (std::is_same<T, float>::value &&
+      paddle::platform::MayIUse(paddle::platform::avx512f)) {
+    EXPECT_TRUE(jitcode != nullptr);
+  }
+  for (int ni = 0; ni < n; ni++) {
+    for (int ci = 0; ci < C; ci++) {
+      auto ptr_x =
+          x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+      auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
+      auto ptr_zref =
+          zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+      auto ptr_ztgt =
+          ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+
+      ref(ptr_x, ptr_y, ptr_zref, h, w);
+      tgt(ptr_x, ptr_y, ptr_ztgt, h, w);
+
+      if (jitcode) {
+        auto ptr_zjit =
+            zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+        jitcode(ptr_x, ptr_y, ptr_zjit, h, w);
+      }
+    }
+  }
+  ExpectEQ<T>(ztgt_data, zref_data, sz);
+  if (jitcode) {
+    ExpectEQ<T>(zjit_data, zref_data, sz);
+  }
+}
+
+// XYZNTuple
+TEST(JITKernel, kVMul) {
+  namespace jit = paddle::operators::jit;
+  TestXYZNKernel<jit::kVMul, float, paddle::platform::CPUPlace>();
+  TestXYZNKernel<jit::kVMul, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kVAdd) {
+  namespace jit = paddle::operators::jit;
+  TestXYZNKernel<jit::kVAdd, float, paddle::platform::CPUPlace>();
+  TestXYZNKernel<jit::kVAdd, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kVAddRelu) {
+  namespace jit = paddle::operators::jit;
+  TestXYZNKernel<jit::kVAddRelu, float, paddle::platform::CPUPlace>();
+  TestXYZNKernel<jit::kVAddRelu, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kVSub) {
+  namespace jit = paddle::operators::jit;
+  TestXYZNKernel<jit::kVSub, float, paddle::platform::CPUPlace>();
+  TestXYZNKernel<jit::kVSub, double, paddle::platform::CPUPlace>();
+}
+
+// AXYNTuples
+TEST(JITKernel, kVScal) {
+  namespace jit = paddle::operators::jit;
+  TestAXYNKernel<jit::kVScal, float, paddle::platform::CPUPlace>();
+  TestAXYNKernel<jit::kVScal, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kVAddBias) {
+  namespace jit = paddle::operators::jit;
+  TestAXYNKernel<jit::kVAddBias, float, paddle::platform::CPUPlace>();
+  TestAXYNKernel<jit::kVAddBias, double, paddle::platform::CPUPlace>();
+}
+
+// XYNTuples
+TEST(JITKernel, kVRelu) {
+  namespace jit = paddle::operators::jit;
+  TestXYNKernel<jit::kVRelu, float, paddle::platform::CPUPlace>();
+  TestXYNKernel<jit::kVRelu, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kVIdentity) {
+  namespace jit = paddle::operators::jit;
+  TestXYNKernel<jit::kVIdentity, float, paddle::platform::CPUPlace>();
+  TestXYNKernel<jit::kVIdentity, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kVExp) {
+  namespace jit = paddle::operators::jit;
+  TestXYNKernel<jit::kVExp, float, paddle::platform::CPUPlace>();
+  TestXYNKernel<jit::kVExp, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kVSigmoid) {
+  namespace jit = paddle::operators::jit;
+  TestXYNKernel<jit::kVSigmoid, float, paddle::platform::CPUPlace>();
+  TestXYNKernel<jit::kVSigmoid, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kVTanh) {
+  namespace jit = paddle::operators::jit;
+  TestXYNKernel<jit::kVTanh, float, paddle::platform::CPUPlace>();
+  TestXYNKernel<jit::kVTanh, double, paddle::platform::CPUPlace>();
+}
+
+// LSTM
+TEST(JITKernel, kLSTMCtHt) {
+  namespace jit = paddle::operators::jit;
+  TestLSTMKernel<jit::kLSTMCtHt, float, paddle::platform::CPUPlace>();
+  TestLSTMKernel<jit::kLSTMCtHt, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kLSTMC1H1) {
+  namespace jit = paddle::operators::jit;
+  TestLSTMKernel<jit::kLSTMC1H1, float, paddle::platform::CPUPlace>();
+  TestLSTMKernel<jit::kLSTMC1H1, double, paddle::platform::CPUPlace>();
+}
+
+// GRU
+TEST(JITKernel, kGRUH1) {
+  namespace jit = paddle::operators::jit;
+  TestGRUKernel<jit::kGRUH1, float, paddle::platform::CPUPlace>();
+  TestGRUKernel<jit::kGRUH1, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kGRUHtPart1) {
+  namespace jit = paddle::operators::jit;
+  TestGRUKernel<jit::kGRUHtPart1, float, paddle::platform::CPUPlace>();
+  TestGRUKernel<jit::kGRUHtPart1, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kGRUHtPart2) {
+  namespace jit = paddle::operators::jit;
+  TestGRUKernel<jit::kGRUHtPart2, float, paddle::platform::CPUPlace>();
+  TestGRUKernel<jit::kGRUHtPart2, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kNCHW16CMulNC) {
+  namespace jit = paddle::operators::jit;
+  TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float,
+                         paddle::platform::CPUPlace>();
+  TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double,
+                         paddle::platform::CPUPlace>();
+}
+
+// TODO(yihua/TJ): add crf decoding and layer norm unit tests
+
+TEST(JITKernel, pool) {
+  // TODO(TJ): add some test
+}
diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h
index 78d20ddf5fd63b81fd5e7fba656d825897a67a11..f564a103963bd93732165596712230b0f37f7f26 100644
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
     !defined(__OSX__)
-#include "paddle/fluid/operators/math/jit_kernel.h"
+#include "paddle/fluid/operators/jit/kernels.h"
 #endif
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -229,12 +229,12 @@ class LayerNormKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(scale->numel(), right);
     PADDLE_ENFORCE_EQ(bias->numel(), right);
 
-    const auto& ker = math::jitkernel::KernelPool::Instance()
-                          .template Get<math::jitkernel::LayerNormKernel<T>>(
-                              static_cast<int>(right));
-    ker->Compute(x.data<T>(), out.data<T>(), mean->data<T>(), var->data<T>(),
-                 scale->data<T>(), bias->data<T>(), static_cast<int>(left),
-                 static_cast<const float>(epsilon));
+    auto ker =
+        jit::Get<jit::kLayerNorm, jit::LayerNormTuples<T>, platform::CPUPlace>(
+            right);
+    ker(x.data<T>(), out.data<T>(), mean->data<T>(), var->data<T>(),
+        scale->data<T>(), bias->data<T>(), static_cast<int>(left),
+        static_cast<const float>(epsilon), right);
 #endif
   }
 };
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index b3d2ea38eb1bfffadc1f68c5a34bc4d557bdea3b..ea6aebd291eee580a307aa112117434fa942005e 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -73,12 +73,3 @@ if(WITH_GPU)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
-
-set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc)
-set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
-if(WITH_XBYAK)
-    list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
-    list(APPEND JIT_KERNEL_DEPS xbyak)
-endif()
-cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
-cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu
index 760a065c1081d1e55901774b258ba524471b856b..e925e7bb5917c9433c3c79b9a21a41b4d48a5ba0 100644
--- a/paddle/fluid/operators/math/concat_and_split.cu
+++ b/paddle/fluid/operators/math/concat_and_split.cu
@@ -131,9 +131,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     int in_col = input[0].numel() / in_row;
     int out_row = in_row, out_col = 0;
 
-    framework::Vector<int16_t> inputs_data(in_num * sizeof(T*) / 2);
-    framework::Vector<int> inputs_col(in_num + 1);
-    T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
+    std::vector<const T*> inputs_data;
+    std::vector<int> inputs_col(in_num + 1);
+    inputs_data.reserve(in_num);
 
     inputs_col[0] = 0;
     bool sameShape = true;
@@ -144,12 +144,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
       }
       out_col += t_cols;
       inputs_col[i + 1] = out_col;
-      inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
+      inputs_data.emplace_back(input[i].data<T>());
     }
 
-    T** dev_ins_data =
-        reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
-
     // computation
     // set the thread block and grid according to CurrentDeviceId
     const int kThreadsPerBlock = 1024;
@@ -169,18 +166,32 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
         std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
+    auto tmp_dev_ins_data =
+        platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate(
+            inputs_data.size() * sizeof(T*));
+    memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
+                 tmp_dev_ins_data->ptr(), platform::CPUPlace(),
+                 static_cast<void*>(inputs_data.data()),
+                 inputs_data.size() * sizeof(T*), context.stream());
+    T** dev_ins_data = reinterpret_cast<T**>(tmp_dev_ins_data->ptr());
+
     if (sameShape) {
       ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
           dev_ins_data, in_col, out_row, out_col, output->data<T>());
     } else {
-      const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
+      auto tmp_dev_ins_col_data =
+          platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate(
+              inputs_col.size() * sizeof(int));
+      memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
+                   tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
+                   static_cast<void*>(inputs_col.data()),
+                   inputs_col.size() * sizeof(int), context.stream());
+      int* dev_ins_col_data = static_cast<int*>(tmp_dev_ins_col_data->ptr());
+
       ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
           dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
           out_row, out_col, output->data<T>());
     }
-    // Wait() must be called because `inputs_data` may be destructed before
-    // kernel ends
-    context.Wait();
   }
 };
 
@@ -207,9 +218,8 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
     int in_col = 0, in_row = out_row;
     bool sameShape = true;
 
-    framework::Vector<int16_t> outputs_data(o_num * sizeof(T*) / 2);
-    framework::Vector<int> outputs_cols(o_num + 1);
-    T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
+    std::vector<T*> outputs_data(o_num);
+    std::vector<int> outputs_cols(o_num + 1);
 
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
@@ -220,15 +230,12 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
       in_col += t_col;
       outputs_cols[i + 1] = in_col;
       if (outputs->at(i) != nullptr) {
-        outputs_ptr[i] = outputs->at(i)->data<T>();
+        outputs_data[i] = outputs->at(i)->data<T>();
       } else {
-        outputs_ptr[i] = nullptr;
+        outputs_data[i] = nullptr;
       }
     }
 
-    T** dev_out_gpu_data =
-        reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
-
     // computation
     const int kThreadsPerBlock = 1024;
     int block_cols = kThreadsPerBlock;
@@ -247,18 +254,33 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
         std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
+    auto tmp_dev_outs_data =
+        platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate(
+            outputs_data.size() * sizeof(T*));
+    memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
+                 tmp_dev_outs_data->ptr(), platform::CPUPlace(),
+                 reinterpret_cast<void*>(outputs_data.data()),
+                 outputs_data.size() * sizeof(T*), context.stream());
+    T** dev_out_gpu_data = reinterpret_cast<T**>(tmp_dev_outs_data->ptr());
+
     if (sameShape) {
       SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
           input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
     } else {
-      const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
+      auto tmp_dev_ins_col_data =
+          platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate(
+              outputs_cols.size() * sizeof(int));
+      memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
+                   tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
+                   reinterpret_cast<void*>(outputs_cols.data()),
+                   outputs_cols.size() * sizeof(int), context.stream());
+      int* dev_outs_col_data =
+          reinterpret_cast<int*>(tmp_dev_ins_col_data->ptr());
+
       SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
           input.data<T>(), in_row, in_col, dev_outs_col_data,
           static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
     }
-    // Wait() must be called because `outputs_data` may be destructed before
-    // kernel ends
-    context.Wait();
   }
 };
 
diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
index ccbd05c82ad6a880d21269092088be9656b35c99..2e3779ff0845294e71f27801049c010e0a585e6b 100644
--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
@@ -17,6 +17,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
 
+#if defined(_WIN32)
+#if defined(__AVX2__) || defined(__AVX__)
+inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); }
+#endif
+#endif
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h
index 5b9953a5aa9a29bd917d16a16c678fc32a32c18f..cddd0a18db53a7ddf9ca14d5f373180586ef6a31 100644
--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/jit_kernel.h"
 
 namespace paddle {
 namespace operators {
@@ -30,22 +30,21 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
     return;
   }
   if (relu) {
-    const auto& vaddrelu = jitkernel::KernelPool::Instance()
-                               .template Get<jitkernel::VAddReluKernel<T>>(N);
+    auto compute =
+        jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(N);
     for (int i = 0; i < M; i++) {
       T* dst = Y + i * N;
-      vaddrelu->Compute(B, dst, dst, N);
+      compute(B, dst, dst, N);
     }
   } else {
-    const auto& vadd = jitkernel::KernelPool::Instance()
-                           .template Get<jitkernel::VAddKernel<T>>(N);
-
+    auto compute =
+        jit::Get<jit::kVAdd, jit::XYZNTuples<T>, platform::CPUPlace>(N);
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
     for (int i = 0; i < M; i++) {
       T* dst = Y + i * N;
-      vadd->Compute(B, dst, dst, N);
+      compute(B, dst, dst, N);
     }
   }
 }
diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
deleted file mode 100644
index 2b08c1059713fb9acd0cfdcf39ac2ad283172724..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_code.cc
+++ /dev/null
@@ -1,334 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/jit_code.h"
-#include <stddef.h>                                  // offsetof
-#include "paddle/fluid/operators/math/jit_kernel.h"  // TODO(TJ): remove me
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-namespace gen {
-
-using namespace platform;  // NOLINT
-
-bool VXXJitCode::init(int d, int scalar_index) {
-  // It's not necessary to use avx512 since it would slow down the frequency
-  // and this kernel is not compute bound.
-  return MayIUse(avx) && scalar_index >= 0 && scalar_index <= 2;
-}
-
-void VXXJitCode::generate() {
-  // do not need push stack, and do not need save avx512reg if do not use avx512
-  int offset = 0;
-  if (with_relu_) {
-    vxorps(ymm_zero, ymm_zero, ymm_zero);
-  }
-  if (scalar_index_ == 1) {
-    vbroadcastss(ymm_src1, ptr[param1]);
-  } else if (scalar_index_ == 2) {
-    vbroadcastss(ymm_src2, ptr[param2]);
-  }
-  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
-    if (scalar_index_ != 1) {
-      vmovups(ymm_src1, ptr[param1 + offset]);
-    }
-    if (scalar_index_ != 2) {
-      vmovups(ymm_src2, ptr[param2 + offset]);
-    }
-    if (type_ == operand_type::mul) {
-      vmulps(ymm_dst, ymm_src1, ymm_src2);
-    } else if (type_ == operand_type::add) {
-      vaddps(ymm_dst, ymm_src1, ymm_src2);
-    }
-    if (with_relu_) {
-      vmaxps(ymm_dst, ymm_zero, ymm_dst);
-    }
-    vmovups(ptr[param3 + offset], ymm_dst);
-    offset += sizeof(float) * YMM_FLOAT_BLOCK;
-  }
-  int rest = num_ % YMM_FLOAT_BLOCK;
-  while (rest > 0) {
-    int block = XMM_FLOAT_BLOCK;
-    if (rest >= 4) {
-      block = 4;
-      if (scalar_index_ != 1) {
-        vmovups(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovups(xmm_src2, ptr[param2 + offset]);
-      }
-    } else if (rest >= 2) {
-      block = 2;
-      if (scalar_index_ != 1) {
-        vmovq(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovq(xmm_src2, ptr[param2 + offset]);
-      }
-    } else {
-      block = 1;
-      if (scalar_index_ != 1) {
-        vmovss(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovss(xmm_src2, ptr[param2 + offset]);
-      }
-    }
-    switch (type_) {
-      case operand_type::mul:
-        vmulps(xmm_dst, xmm_src1, xmm_src2);
-        break;
-      case operand_type::add:
-        vaddps(xmm_dst, xmm_src1, xmm_src2);
-        break;
-      default:
-        break;
-    }
-    if (with_relu_) {
-      vmaxps(xmm_dst, xmm_zero, xmm_dst);
-    }
-    if (rest >= 4) {
-      vmovups(ptr[param3 + offset], xmm_dst);
-    } else if (rest >= 2) {
-      vmovq(ptr[param3 + offset], xmm_dst);
-    } else {
-      vmovss(ptr[param3 + offset], xmm_dst);
-    }
-    offset += sizeof(float) * block;
-    rest -= block;
-  }
-  ret();
-}
-
-const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = {
-    REPEAT_8TIMES(1.f),
-    REPEAT_8TIMES(2.f),
-    REPEAT_8TIMES(0.5f),
-    REPEAT_8TIMES(EXP_HIG),
-    REPEAT_8TIMES(EXP_LOW),
-    REPEAT_8TIMES(CEPHES_LOG2EF),
-    REPEAT_8TIMES(CEPHES_EXP_C1),
-    REPEAT_8TIMES(CEPHES_EXP_C2),
-    REPEAT_8TIMES(CEPHES_EXP_P0),
-    REPEAT_8TIMES(CEPHES_EXP_P1),
-    REPEAT_8TIMES(CEPHES_EXP_P2),
-    REPEAT_8TIMES(CEPHES_EXP_P3),
-    REPEAT_8TIMES(CEPHES_EXP_P4),
-    REPEAT_8TIMES(CEPHES_EXP_P5),
-    REPEAT_8TIMES(EXP_MAX_INPUT),
-    REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
-    REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
-
-const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)};
-int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0};
-
-bool VActJitCode::init(int d, operand_type type) {
-  // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
-  return MayIUse(avx);
-}
-
-void VActJitCode::generate() {
-  int offset = 0;
-  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
-    vmovups(ymm_src, ptr[param1 + offset]);
-    act<ymm_t>(ymm_dst, ymm_src, type_);
-    vmovups(ptr[param2 + offset], ymm_dst);
-    offset += sizeof(float) * YMM_FLOAT_BLOCK;
-  }
-  int rest = num_ % YMM_FLOAT_BLOCK;
-  while (rest > 0) {
-    int block = XMM_FLOAT_BLOCK;
-    if (rest >= 4) {
-      block = 4;
-      vmovups(xmm_src, ptr[param1 + offset]);
-    } else if (rest >= 2) {
-      block = 2;
-      vmovq(xmm_src, ptr[param1 + offset]);
-    } else {
-      block = 1;
-      vmovss(xmm_src, ptr[param1 + offset]);
-    }
-    act<xmm_t>(xmm_dst, xmm_src, type_);
-    if (rest >= 4) {
-      vmovups(ptr[param2 + offset], xmm_dst);
-    } else if (rest >= 2) {
-      vmovq(ptr[param2 + offset], xmm_dst);
-    } else {
-      vmovss(ptr[param2 + offset], xmm_dst);
-    }
-    offset += sizeof(float) * block;
-    rest -= block;
-  }
-  ret();
-}
-
-bool LSTMJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; }
-
-void LSTMJitCode::generate() {
-  if (use_peephole_) {
-    preCode();
-  }
-  reg64_t reg_ptr_gates = rax;
-  reg64_t reg_ptr_ct_1 = r9;
-  reg64_t reg_ptr_ct = r10;
-  reg64_t reg_ptr_ht = r11;
-  reg64_t reg_ptr_wp = r12;
-  mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]);
-  mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]);
-  mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]);
-  mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]);
-  if (use_peephole_) {
-    mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]);
-  }
-
-  int offset = 0;
-  int d = num_ * sizeof(float);
-  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
-    /* gates: W_ch, W_ih, W_fh, W_oh */
-    ymm_t ymm_c = ymm_t(0);
-    ymm_t ymm_i = ymm_t(1);
-    ymm_t ymm_f = ymm_t(2);
-    ymm_t ymm_o = ymm_t(3);
-    ymm_t ymm_ct_1 = ymm_t(4);
-    ymm_t ymm_wp0 = ymm_t(5);
-    ymm_t ymm_wp1 = ymm_t(6);
-    ymm_t ymm_wp2 = ymm_t(7);
-    vmovups(ymm_c, ptr[reg_ptr_gates + offset]);
-    vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]);
-    vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]);
-    vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]);
-    if (!compute_c1h1_) {
-      vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]);
-    }
-    if (use_peephole_) {
-      vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]);
-      vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]);
-      vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]);
-    }
-    /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */
-    // act_cand(c)
-    act<ymm_t>(ymm_c, ymm_c, act_cand_);
-    // act_gate(i) or act_gate(ct_1 * wp0 + i)
-    if (!compute_c1h1_ && use_peephole_) {
-      vmulps(ymm_wp0, ymm_ct_1, ymm_wp0);
-      vaddps(ymm_i, ymm_i, ymm_wp0);
-    }
-    act<ymm_t>(ymm_i, ymm_i, act_gate_);
-    vmulps(ymm_c, ymm_c, ymm_i);
-    if (!compute_c1h1_) {
-      // act_gate(f) or act_gate(ct_1 * wp1 + f)
-      if (use_peephole_) {
-        vmulps(ymm_wp1, ymm_ct_1, ymm_wp1);
-        vaddps(ymm_f, ymm_f, ymm_wp1);
-      }
-      act<ymm_t>(ymm_f, ymm_f, act_gate_);
-      // ct
-      vmulps(ymm_f, ymm_f, ymm_ct_1);
-      vaddps(ymm_f, ymm_f, ymm_c);
-    }
-    /* H_t = act_cell(C_t) * act_gate(o) */
-    // act_cell(C_t)
-    ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f;
-    ymm_t ymm_tmp = ymm_i;
-    act<ymm_t>(ymm_tmp, ymm_ct, act_cell_);
-    // act_gate(o) or act_gate(ct * wp2 + o)
-    if (use_peephole_) {
-      vmulps(ymm_wp2, ymm_ct, ymm_wp2);
-      vaddps(ymm_o, ymm_o, ymm_wp2);
-    }
-    act<ymm_t>(ymm_o, ymm_o, act_gate_);
-    // ht
-    vmulps(ymm_o, ymm_o, ymm_tmp);
-    // save ct and ht
-    vmovups(ptr[reg_ptr_ct + offset], ymm_ct);
-    vmovups(ptr[reg_ptr_ht + offset], ymm_o);
-    offset += sizeof(float) * YMM_FLOAT_BLOCK;
-  }
-
-  if (use_peephole_) {
-    postCode();
-  } else {
-    ret();
-  }
-}
-
-bool GRUJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; }
-
-void GRUJitCode::generate() {
-  reg64_t reg_ptr_gates = rax;
-  reg64_t reg_ptr_ht_1 = r9;
-  reg64_t reg_ptr_ht = r10;
-  mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]);
-  mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]);
-  mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]);
-  ymm_t ymm_one = ymm_t(0);
-
-  if (id_ == 2) {
-    reg64_t reg_ptr_tmp = r11;
-    mov(reg_ptr_tmp, reinterpret_cast<size_t>(exp_float_consts));
-    vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]);
-  }
-  int offset = 0;
-  int d = num_ * sizeof(float);
-  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
-    ymm_t ymm_u = ymm_t(1);
-    ymm_t ymm_r = ymm_t(2);
-    ymm_t ymm_s = ymm_t(3);
-    ymm_t ymm_ht_1 = ymm_t(4);
-    // W: {W_update, W_reset; W_state}
-    if (id_ == 0 || id_ == 2) {
-      vmovups(ymm_u, ptr[reg_ptr_gates + offset]);
-      vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]);
-    }
-    if (id_ == 1) {
-      vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]);
-    }
-    if (id_ == 1 || id_ == 2) {
-      vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]);
-    }
-
-    if (id_ == 0) {
-      // ht = act_gate(u) * act_cand(s)
-      act<ymm_t>(ymm_u, ymm_u, act_gate_);
-      act<ymm_t>(ymm_s, ymm_s, act_cand_);
-      vmulps(ymm_s, ymm_s, ymm_u);
-      vmovups(ptr[reg_ptr_ht + offset], ymm_s);
-    } else if (id_ == 1) {
-      // ht = act_gate(r) * ht_1
-      act<ymm_t>(ymm_r, ymm_r, act_gate_);
-      vmulps(ymm_r, ymm_r, ymm_ht_1);
-      vmovups(ptr[reg_ptr_ht + offset], ymm_r);
-    } else if (id_ == 2) {
-      // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
-      ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx());
-      act<ymm_t>(ymm_u, ymm_u, act_gate_);
-      act<ymm_t>(ymm_s, ymm_s, act_cand_);
-      vmulps(ymm_s, ymm_s, ymm_u);
-      vsubps(ymm_u, ymm_one_inner, ymm_u);
-      vmulps(ymm_u, ymm_ht_1, ymm_u);
-      vaddps(ymm_u, ymm_s, ymm_u);
-      vmovups(ptr[reg_ptr_ht + offset], ymm_u);
-    }
-    offset += sizeof(float) * YMM_FLOAT_BLOCK;
-  }
-
-  ret();
-}
-}  // namespace gen
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_gen.cc b/paddle/fluid/operators/math/jit_gen.cc
deleted file mode 100644
index 5c6672928e8c03ccb1920bd828f785084e422fc2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_gen.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/jit_gen.h"
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include "paddle/fluid/platform/cpu_info.h"
-
-DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-namespace gen {
-
-constexpr Xbyak::Operand::Code g_abi_regs[] = {
-    Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
-    Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15};
-
-constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]);
-
-void JitCode::preCode() {
-  for (int i = 0; i < num_g_abi_regs; ++i) {
-    push(Xbyak::Reg64(g_abi_regs[i]));
-  }
-  if (platform::MayIUse(platform::avx512f)) {
-    mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
-  }
-}
-
-void JitCode::postCode() {
-  for (int i = 0; i < num_g_abi_regs; ++i) {
-    pop(Xbyak::Reg64(g_abi_regs[num_g_abi_regs - 1 - i]));
-  }
-  ret();
-}
-
-void JitCode::dumpCode(const Xbyak::uint8 *code) const {
-  if (code) {
-    static int counter = 0;
-    std::ostringstream filename;
-    filename << "paddle_jitcode_" << name() << "." << counter << ".bin";
-    counter++;
-    std::ofstream fout(filename.str(), std::ios::out);
-    if (fout.is_open()) {
-      fout.write(reinterpret_cast<const char *>(code), getSize());
-      fout.close();
-    }
-  }
-}
-
-Xbyak::Address JitCode::EVEX_compress_addr(Xbyak::Reg64 base, int offt,
-                                           bool bcast) {
-  int scale = 0;
-  if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) {
-    offt = offt - 2 * EVEX_max_8b_offt;
-    scale = 1;
-  } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) {
-    offt = offt - 4 * EVEX_max_8b_offt;
-    scale = 2;
-  }
-  auto re = Xbyak::RegExp() + base + offt;
-  if (scale) {
-    re = re + reg_EVEX_max_8b_offt * scale;
-  }
-  if (bcast) {
-    return zword_b[re];
-  } else {
-    return zword[re];
-  }
-}
-
-}  // namespace gen
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h
deleted file mode 100644
index 6abf3434cc8d8f6ab2838ef822a4f6b948331802..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_gen.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <gflags/gflags.h>
-#include <type_traits>
-#include "paddle/fluid/platform/macros.h"
-
-#define XBYAK_USE_MMAP_ALLOCATOR
-#include "xbyak/xbyak.h"
-#include "xbyak/xbyak_util.h"
-
-DECLARE_bool(dump_jitcode);
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-namespace gen {
-
-#define DECLARE_JIT_CODE(codename) \
-  const char *name() const override { return #codename; }
-
-// Application Binary Interface
-constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI),
-    abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX),
-    abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX);
-
-class JitCode : public Xbyak::CodeGenerator {
- public:
-  explicit JitCode(size_t code_size = 256 * 1024, void *code_ptr = nullptr)
-      : Xbyak::CodeGenerator(code_size, code_ptr) {}
-
-  virtual ~JitCode() {}
-  virtual const char *name() const = 0;
-  virtual void generate() = 0;
-
-  template <typename FUNC>
-  const FUNC getCode() {
-    this->generate();
-    const Xbyak::uint8 *code = CodeGenerator::getCode();
-    if (FLAGS_dump_jitcode) {
-      this->dumpCode(code);
-    }
-    return reinterpret_cast<const FUNC>(code);
-  }
-  DISABLE_COPY_AND_ASSIGN(JitCode);
-
- protected:
-  Xbyak::Reg64 param1{abi_param1};
-  const int EVEX_max_8b_offt = 0x200;
-  const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp;
-
-  void preCode();
-  void postCode();
-  void dumpCode(const Xbyak::uint8 *code) const;
-  void L(const char *label) { Xbyak::CodeGenerator::L(label); }
-  void L(const Xbyak::Label &label) { Xbyak::CodeGenerator::L(label); }
-  // Enhanced vector extension
-  Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt,
-                                    bool bcast = false);
-};
-
-}  // namespace gen
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc
deleted file mode 100644
index 118696ba47986e2dbf97535333c9817b7c264a54..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_kernel.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/jit_kernel.h"
-#include <iostream>
-#include <string>
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-
-KernelPool& KernelPool::Instance() {
-  static thread_local KernelPool g_jit_kernels;
-  return g_jit_kernels;
-}
-
-std::shared_ptr<const Kernel> KernelPool::Get(const std::string& key) const {
-  if (kers_.find(key) == kers_.end()) {
-    return nullptr;
-  }
-  return kers_.at(key);
-}
-
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h
deleted file mode 100644
index b78b92b4f97b761654a5b9b178f96c1dc99f7789..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <functional>
-#include <memory>  // for shared_ptr
-#include <string>
-#include <unordered_map>
-#include "paddle/fluid/operators/math/jit_kernel_impl.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/macros.h"
-
-// Note: Only support on CPU yet.
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-
-// TODO(TJ): remove me
-typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block;
-
-class Kernel {
- public:
-  Kernel() = default;
-  virtual ~Kernel() = default;
-  // TODO(TJ): below members should be deprecated.
-  int num_{0};
-  int end_{0};
-  int rest_{0};
-  DISABLE_COPY_AND_ASSIGN(Kernel);
-};
-
-class KernelPool {
- public:
-  static KernelPool &Instance();
-
-  template <typename Ker, typename... ARGS>
-  std::shared_ptr<const Ker> Get(ARGS... args);
-
-  std::shared_ptr<const Kernel> Get(const std::string &key) const;
-
- private:
-  KernelPool() = default;
-  std::unordered_map<std::string, std::shared_ptr<const Kernel>> kers_;
-
-  DISABLE_COPY_AND_ASSIGN(KernelPool);
-};
-
-template <typename T>
-class VMulKernel : public Kernel {
- public:
-  void (*Compute)(const T *, const T *, T *, int);
-};
-
-template <typename T>
-class VAddKernel : public Kernel {
- public:
-  void (*Compute)(const T *, const T *, T *, int);
-};
-
-template <typename T>
-class VAddReluKernel : public Kernel {
- public:
-  void (*Compute)(const T *, const T *, T *, int);
-};
-
-template <typename T>
-class VScalKernel : public Kernel {
- public:
-  // y = a.*x
-  void (*Compute)(const T *, const T *, T *, int);
-};
-
-template <typename T>
-class VAddBiasKernel : public Kernel {
- public:
-  // y = a.+x
-  void (*Compute)(const T *, const T *, T *, int);
-};
-
-#ifdef PADDLE_WITH_MKLDNN
-template <typename T>
-class EltwiseMulnChw16cNCKernel : public Kernel {
- public:
-  // nChw16c = nChw16c .* NC
-  void (*Compute)(const float *, const float *, float *, int, int);
-};
-#endif
-
-template <typename T>
-class VActKernel : public Kernel {
- public:
-  void (*Compute)(const T *, T *, int);
-};
-
-template <typename T>
-class VReluKernel : public VActKernel<T> {};
-
-template <typename T>
-class VIdentityKernel : public VActKernel<T> {};
-
-template <typename T>
-class VExpKernel : public VActKernel<T> {};
-
-template <typename T>
-class VSigmoidKernel : public VActKernel<T> {};
-
-template <typename T>
-class VTanhKernel : public VActKernel<T> {};
-
-template <typename T>
-class LSTMKernel : public Kernel {
- public:
-  // compute c1 and h1 without c0 or h0
-  void (*ComputeC1H1)(lstm_t *, const lstm_attr_t *);
-  void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *);
-};
-
-template <typename T>
-class GRUKernel : public Kernel {
- public:
-  // compute h1 without h0
-  void (*ComputeH1)(gru_t *, const gru_attr_t *);
-  void (*ComputeHtPart1)(gru_t *, const gru_attr_t *);
-  void (*ComputeHtPart2)(gru_t *, const gru_attr_t *);
-};
-
-template <typename T>
-class CRFDecodeKernel : public Kernel {
- public:
-  virtual void Compute(const int seq_len, const T *x, const T *w, T *alpha,
-                       int *track) const = 0;
-};
-
-template <typename T>
-class LayerNormKernel : public Kernel {
- public:
-  virtual void Compute(T *x, T *out, T *mean, T *var, const T *scale,
-                       const T *bias, int height,
-                       const float epsilon) const = 0;
-};
-
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc
deleted file mode 100644
index 8cf588efba52314650bfd376b95b10e6d4336b2e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ /dev/null
@@ -1,396 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/jit_kernel.h"
-#include <string>
-#include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#include "paddle/fluid/operators/math/jit_kernel_refer.h"
-#include "paddle/fluid/platform/enforce.h"
-
-#ifdef PADDLE_WITH_XBYAK
-#include "paddle/fluid/operators/math/jit_code.h"
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-
-#ifdef PADDLE_WITH_MKLML
-template <typename T>
-void VMulMKL(const T* x, const T* y, T* z, int n);
-
-template <>
-void VMulMKL<float>(const float* x, const float* y, float* z, int n) {
-  platform::dynload::vsMul(n, x, y, z);
-}
-
-template <>
-void VMulMKL<double>(const double* x, const double* y, double* z, int n) {
-  platform::dynload::vdMul(n, x, y, z);
-}
-
-template <typename T>
-void VAddMKL(const T* x, const T* y, T* z, int n);
-
-template <>
-void VAddMKL<float>(const float* x, const float* y, float* z, int n) {
-  platform::dynload::vsAdd(n, x, y, z);
-}
-
-template <>
-void VAddMKL<double>(const double* x, const double* y, double* z, int n) {
-  platform::dynload::vdAdd(n, x, y, z);
-}
-
-template <typename T>
-void VScalMKL(const T* a, const T* x, T* y, int n);
-
-template <>
-void VScalMKL<float>(const float* a, const float* x, float* y, int n) {
-  if (x == y) {
-    platform::dynload::cblas_sscal(n, *a, y, 1);
-  } else {
-    refer::VScal<float>(a, x, y, n);
-  }
-}
-
-template <>
-void VScalMKL<double>(const double* a, const double* x, double* y, int n) {
-  if (x == y) {
-    platform::dynload::cblas_dscal(n, *a, y, 1);
-  } else {
-    refer::VScal<double>(a, x, y, n);
-  }
-}
-
-#endif
-
-/* VMUL JitKernel */
-template <typename T>
-class VMulKernelImpl : public VMulKernel<T> {
- public:
-  JITKERNEL_DECLARE_STATIC_FUNC;
-  explicit VMulKernelImpl(int d) : VMulKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(d)) {
-      // roughly estimate the size of code
-      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
-      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 0, false,
-                                         sz > 4096 ? sz : 4096));
-      this->Compute =
-          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
-      return;
-    }
-#endif
-#ifdef PADDLE_WITH_MKLML
-    if (useMKL(d)) {
-      this->Compute = VMulMKL<T>;
-      return;
-    }
-#endif
-    this->Compute = refer::VMul<T>;
-  }
-
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool VMulKernelImpl<float>::useJIT(int d) {
-  return gen::VXXJitCode::init(d);
-}
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-template <>
-bool VMulKernelImpl<float>::useMKL(int d) {
-  return platform::MayIUse(platform::avx512f) && d > 512;
-}
-
-template <>
-bool VMulKernelImpl<double>::useMKL(int d) {
-  return true;
-}
-#endif
-
-/* VAdd JitKernel */
-template <typename T>
-class VAddKernelImpl : public VAddKernel<T> {
- public:
-  JITKERNEL_DECLARE_STATIC_FUNC;
-  explicit VAddKernelImpl(int d) : VAddKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(d)) {
-      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
-      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, false,
-                                         sz > 4096 ? sz : 4096));
-      this->Compute =
-          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
-      return;
-    }
-#endif
-#ifdef PADDLE_WITH_MKLML
-    if (useMKL(d)) {
-      this->Compute = VAddMKL<T>;
-      return;
-    }
-#endif
-    this->Compute = refer::VAdd<T>;
-  }
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool VAddKernelImpl<float>::useJIT(int d) {
-  return gen::VXXJitCode::init(d);
-}
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-template <>
-bool VAddKernelImpl<float>::useMKL(int d) {
-  return d > 512;
-}
-
-template <>
-bool VAddKernelImpl<double>::useMKL(int d) {
-  return true;
-}
-#endif
-
-#ifdef PADDLE_WITH_MKLDNN
-/* EltwiseMul for nChw16c & NC inputs JitKernel */
-template <typename T>
-class EltwiseMulnChw16cNCKernelImpl
-    : public math::jitkernel::EltwiseMulnChw16cNCKernel<T> {
- public:
-  JITKERNEL_DECLARE_STATIC_FUNC;
-  explicit EltwiseMulnChw16cNCKernelImpl(int d)
-      : EltwiseMulnChw16cNCKernel<T>() {
-    using mul_func_t = void (*)(const float*, const float*, float*, int, int);
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(d)) {
-      // roughly estimate the size of code
-      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
-      sz = sz > 4096 ? sz : 4096;
-      jitcode_.reset(new gen::EltwiseMulnChw16cNC(sz));
-      this->Compute = (mul_func_t)jitcode_->getCode();
-      return;
-    }
-#endif
-    PADDLE_THROW(
-        "This kernel shouldn't be used in Non-Xbyak, Non-MKL-DNN "
-        "environemnt");
-  }
-
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::EltwiseMulnChw16cNC> jitcode_{nullptr};
-};
-
-template <>
-bool EltwiseMulnChw16cNCKernelImpl<float>::useJIT(int d) {
-  return true;
-}
-#endif
-#endif
-
-/* VAddRelu JitKernel */
-template <typename T>
-class VAddReluKernelImpl : public VAddReluKernel<T> {
- public:
-  JITKERNEL_DECLARE_STATIC_FUNC;
-  explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(d)) {
-      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
-      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, true,
-                                         sz > 4096 ? sz : 4096));
-      this->Compute =
-          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
-      return;
-    }
-#endif
-    this->Compute = refer::VAddRelu<T>;
-  }
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool VAddReluKernelImpl<float>::useJIT(int d) {
-  return gen::VXXJitCode::init(d);
-}
-#endif
-
-/* VScal JitKernel */
-template <typename T>
-class VScalKernelImpl : public VScalKernel<T> {
- public:
-  JITKERNEL_DECLARE_STATIC_FUNC;
-  explicit VScalKernelImpl(int d) : VScalKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(d)) {
-      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
-      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 1, false,
-                                         sz > 4096 ? sz : 4096));
-      this->Compute =
-          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
-      return;
-    }
-#endif
-#ifdef PADDLE_WITH_MKLML
-    if (useMKL(d)) {
-      this->Compute = VScalMKL<T>;
-      return;
-    }
-#endif
-    this->Compute = refer::VScal<T>;
-  }
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool VScalKernelImpl<float>::useJIT(int d) {
-  return gen::VXXJitCode::init(d, 1);
-}
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-template <>
-bool VScalKernelImpl<float>::useMKL(int d) {
-  return d > 512;
-}
-template <>
-bool VScalKernelImpl<double>::useMKL(int d) {
-  return true;
-}
-#endif
-
-/* VAddBias JitKernel */
-template <typename T>
-class VAddBiasKernelImpl : public VAddBiasKernel<T> {
- public:
-  JITKERNEL_DECLARE_STATIC_FUNC;
-  explicit VAddBiasKernelImpl(int d) : VAddBiasKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(d)) {
-      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
-      jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 1, false,
-                                         sz > 4096 ? sz : 4096));
-      this->Compute =
-          jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
-      return;
-    }
-#endif
-
-    this->Compute = refer::VAddBias<T>;
-  }
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::VXXJitCode> jitcode_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool VAddBiasKernelImpl<float>::useJIT(int d) {
-  return gen::VXXJitCode::init(d, 1);
-}
-#endif
-
-/* VRelu JitKernel */
-template <typename T>
-class VReluKernelImpl : public VReluKernel<T> {
- public:
-  JITKERNEL_DECLARE_STATIC_FUNC;
-  explicit VReluKernelImpl(int d) : VReluKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(d)) {
-      size_t sz = 96 /* init size */ +
-                  d / YMM_FLOAT_BLOCK * 4 /* instructions */ *
-                      8 /* average bytes for each instruction */;
-      jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::relu,
-                                          sz > 4096 ? sz : 4096));
-      this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
-      return;
-    }
-#endif
-
-    this->Compute = refer::VRelu<T>;
-  }
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool VReluKernelImpl<float>::useJIT(int d) {
-  return gen::VActJitCode::init(d, gen::operand_type::relu);
-}
-#endif
-
-/* An empty JitKernel */
-template <typename T>
-class VIdentityKernelImpl : public VIdentityKernel<T> {
- public:
-  JITKERNEL_DECLARE_STATIC_FUNC;
-  explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() {
-    this->Compute = refer::VIdentity<T>;
-  }
-};
-
-REGISTER_JITKERNEL(vmul, VMulKernel);
-REGISTER_JITKERNEL(vadd, VAddKernel);
-REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
-REGISTER_JITKERNEL(vscal, VScalKernel);
-REGISTER_JITKERNEL(vaddbias, VAddBiasKernel);
-REGISTER_JITKERNEL(vrelu, VReluKernel);
-REGISTER_JITKERNEL(videntity, VIdentityKernel);
-#ifdef PADDLE_WITH_MKLDNN
-REGISTER_JITKERNEL(eltwise_mul_nchw16c, EltwiseMulnChw16cNCKernel);
-#endif
-
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
deleted file mode 100644
index ac2d29f1c18392ebf917cc097e63670e06b1eded..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/jit_kernel.h"
-#include <limits>
-#include <string>
-#include "paddle/fluid/operators/math/jit_kernel_macro.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-
-/* CRF Decode JitKernel */
-template <typename T, platform::cpu_isa_t isa, jit_block>
-class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
- public:
-  explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel<T>() {
-    this->num_ = tag_num;
-  }
-  void Compute(const int seq_len, const T* x, const T* w, T* alpha,
-               int* track) const override {
-    constexpr int state_trans_base_idx = 2;
-    for (int i = 0; i < this->num_; ++i) {
-      alpha[i] = w[i] + x[i];
-    }
-    for (int k = 1; k < seq_len; ++k) {
-      for (int i = 0; i < this->num_; ++i) {
-        T max_score = -std::numeric_limits<T>::max();
-        int max_j = 0;
-        for (int j = 0; j < this->num_; ++j) {
-          T score = alpha[(k - 1) * this->num_ + j] +
-                    w[(j + state_trans_base_idx) * this->num_ + i];
-          if (score > max_score) {
-            max_score = score;
-            max_j = j;
-          }
-        }
-        alpha[k * this->num_ + i] = max_score + x[k * this->num_ + i];
-        track[k * this->num_ + i] = max_j;
-      }
-    }
-  }
-};
-
-#define INIT_ALPHA(step_size)                                               \
-  /* Setup the alpha initial value.*/                                       \
-  int i_offset = 0;                                                         \
-  int last_offset = this->rest_ - step_size;                                \
-  for (int i = 0; i <= this->end_; ++i) {                                   \
-    /* weights, input and alpha values. */                                  \
-    __m256 w_content, x_content, alpha_content;                             \
-    /* Load the relevant data into the variables from un-aligned address.*/ \
-    w_content = _mm256_loadu_ps(w + i_offset);                              \
-    x_content = _mm256_loadu_ps(x + i_offset);                              \
-    alpha_content = _mm256_add_ps(w_content, x_content);                    \
-    _mm256_storeu_ps(alpha + i_offset, alpha_content);                      \
-    i_offset += step_size;                                                  \
-    if (i == this->end_ - 1) {                                              \
-      if (this->rest_ > 0) {                                                \
-        i_offset += last_offset;                                            \
-      } else {                                                              \
-        break;                                                              \
-      }                                                                     \
-    }                                                                       \
-  }
-
-#define UPDATE_ALPHA(step_size)                                               \
-  /* Update the alpha and track values. */                                    \
-  __m256 x_content = _mm256_loadu_ps(x + seq_offset + this->num_ + j_offset); \
-  max_score = _mm256_add_ps(max_score, x_content);                            \
-  _mm256_storeu_ps(alpha + seq_offset + this->num_ + j_offset, max_score);    \
-  _mm256_storeu_si256(                                                        \
-      reinterpret_cast<__m256i*>(track + seq_offset + this->num_ + j_offset), \
-      max_j);                                                                 \
-  /* Calculate the offset of next step*/                                      \
-  j_offset += step_size;                                                      \
-  if (j == this->end_ - 1) {                                                  \
-    if (this->rest_ > 0) {                                                    \
-      j_offset += last_offset;                                                \
-    } else {                                                                  \
-      break;                                                                  \
-    }                                                                         \
-  }
-
-#define INTRIAVX_FLOAT(block)                                                  \
-  template <>                                                                  \
-  CRFDecodeKernelImpl<float, platform::avx, block>::CRFDecodeKernelImpl(       \
-      int tag_num)                                                             \
-      : CRFDecodeKernel<float>() {                                             \
-    this->num_ = tag_num;                                                      \
-    this->end_ = this->num_ / YMM_FLOAT_BLOCK;                                 \
-    this->rest_ = this->num_ % YMM_FLOAT_BLOCK;                                \
-  }                                                                            \
-  template <>                                                                  \
-  void CRFDecodeKernelImpl<float, platform::avx, block>::Compute(              \
-      const int seq_len, const float* x, const float* w, float* alpha,         \
-      int* track) const {                                                      \
-    INIT_ALPHA(YMM_FLOAT_BLOCK)                                                \
-    /* Use the column-major strategy to get the location of maximum score.*/   \
-    int seq_offset = 0;                                                        \
-    constexpr int state_trans_base_idx = 2;                                    \
-    for (int k = 1; k < seq_len; ++k) {                                        \
-      int j_offset = 0;                                                        \
-      for (int j = 0; j <= this->end_; ++j) {                                  \
-        /* Initialize the variables of maximum score and location.*/           \
-        __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max()); \
-        __m256i max_j = _mm256_set1_epi32(0);                                  \
-        /* Calculate the offset of transition_weights.*/                       \
-        int trans_offset = state_trans_base_idx * this->num_ + j_offset;       \
-        for (int i = 0; i < this->num_; ++i) {                                 \
-          /* Initalize the content of alpha variable with related offset.*/    \
-          __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i);  \
-          /* Obtain the content of weights from un-aligned address.*/          \
-          __m256 w_content = _mm256_loadu_ps(w + trans_offset);                \
-          __m256 score_v = _mm256_add_ps(alpha_content, w_content);            \
-          __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);         \
-          /* According to the mask value, update the index of the max_score.*/ \
-          /* AVX instructions.*/                                               \
-          __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);               \
-          __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);               \
-          __m128i lo_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 0);     \
-          __m128i hi_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 1);     \
-          lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);                      \
-          hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);                      \
-          lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));                 \
-          hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));                 \
-          lo_max_j = _mm_or_si128(lo_mask, lo_max_j);                          \
-          hi_max_j = _mm_or_si128(hi_mask, hi_max_j);                          \
-          max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);                 \
-          max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);                 \
-          /* AVX done*/                                                        \
-          /* Update the max_score value.*/                                     \
-          max_score = _mm256_max_ps(max_score, score_v);                       \
-          trans_offset += this->num_;                                          \
-        }                                                                      \
-        UPDATE_ALPHA(YMM_FLOAT_BLOCK)                                          \
-      }                                                                        \
-      seq_offset += this->num_;                                                \
-    }                                                                          \
-  }
-
-#define INTRIAVX2_FLOAT(isa, block)                                            \
-  template <>                                                                  \
-  CRFDecodeKernelImpl<float, isa, block>::CRFDecodeKernelImpl(int tag_num)     \
-      : CRFDecodeKernel<float>() {                                             \
-    this->num_ = tag_num;                                                      \
-    this->end_ = this->num_ / YMM_FLOAT_BLOCK;                                 \
-    this->rest_ = this->num_ % YMM_FLOAT_BLOCK;                                \
-  }                                                                            \
-  template <>                                                                  \
-  void CRFDecodeKernelImpl<float, isa, block>::Compute(                        \
-      const int seq_len, const float* x, const float* w, float* alpha,         \
-      int* track) const {                                                      \
-    INIT_ALPHA(YMM_FLOAT_BLOCK)                                                \
-    /* Use the column-major strategy to get the location of maximum score.*/   \
-    int seq_offset = 0;                                                        \
-    constexpr int state_trans_base_idx = 2;                                    \
-    for (int k = 1; k < seq_len; ++k) {                                        \
-      int j_offset = 0;                                                        \
-      for (int j = 0; j <= this->end_; ++j) {                                  \
-        /* Initialize the variables of maximum score and location.*/           \
-        __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max()); \
-        __m256i max_j = _mm256_set1_epi32(0);                                  \
-        /* Calculate the offset of transition_weights.*/                       \
-        int trans_offset = state_trans_base_idx * this->num_ + j_offset;       \
-        for (int i = 0; i < this->num_; ++i) {                                 \
-          /* Initalize the content of alpha variable with related offset.*/    \
-          __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i);  \
-          /* Obtain the content of weights from un-aligned address.*/          \
-          __m256 w_content = _mm256_loadu_ps(w + trans_offset);                \
-          __m256 score_v = _mm256_add_ps(alpha_content, w_content);            \
-          __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);         \
-          /* According to the mask value, update the index of the max_score.*/ \
-          /* AVX2 instructions.*/                                              \
-          max_j = _mm256_or_si256(                                             \
-              _mm256_andnot_si256((__m256i)mask, max_j),                       \
-              _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));          \
-          /* Update the max_score value.*/                                     \
-          max_score = _mm256_max_ps(max_score, score_v);                       \
-          trans_offset += this->num_;                                          \
-        }                                                                      \
-        UPDATE_ALPHA(YMM_FLOAT_BLOCK)                                          \
-      }                                                                        \
-      seq_offset += this->num_;                                                \
-    }                                                                          \
-  }
-
-#define INTRIAVX512_FLOAT(block)                                               \
-  template <>                                                                  \
-  CRFDecodeKernelImpl<float, platform::avx512f, block>::CRFDecodeKernelImpl(   \
-      int tag_num)                                                             \
-      : CRFDecodeKernel<float>() {                                             \
-    this->num_ = tag_num;                                                      \
-    this->end_ = this->num_ / ZMM_FLOAT_BLOCK;                                 \
-    this->rest_ = this->num_ % ZMM_FLOAT_BLOCK;                                \
-  }                                                                            \
-  template <>                                                                  \
-  void CRFDecodeKernelImpl<float, platform::avx512f, block>::Compute(          \
-      const int seq_len, const float* x, const float* w, float* alpha,         \
-      int* track) const {                                                      \
-    INIT_ALPHA(ZMM_FLOAT_BLOCK)                                                \
-    /* Use the column-major strategy to get the location of maximum score.*/   \
-    int seq_offset = 0;                                                        \
-    constexpr int state_trans_base_idx = 2;                                    \
-    for (int k = 1; k < seq_len; ++k) {                                        \
-      int j_offset = 0;                                                        \
-      for (int j = 0; j <= this->end_; ++j) {                                  \
-        /* Initialize the variables of maximum score and location.*/           \
-        __m512 max_score = _mm512_set1_ps(-std::numeric_limits<float>::max()); \
-        __m512i max_j = _mm512_setzero_si512();                                \
-        /* Calculate the offset of transition_weights.*/                       \
-        int trans_offset = state_trans_base_idx * this->num_ + j_offset;       \
-        for (int i = 0; i < this->num_; ++i) {                                 \
-          /* Initalize the content of alpha variable with related offset.*/    \
-          __m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i));    \
-          /* Obtain the content of weights from un-aligned address.*/          \
-          __m512 w_content = _mm512_loadu_ps(w + trans_offset);                \
-          __m512 score_v = _mm512_add_ps(alpha_content, w_content);            \
-          __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); \
-          /* AVX512 instructions.*/                                            \
-          max_j = _mm512_mask_set1_epi32(max_j, mask, i);                      \
-          /* Update the max_score value.*/                                     \
-          max_score = _mm512_max_ps(max_score, score_v);                       \
-          trans_offset += this->num_;                                          \
-        }                                                                      \
-        /* Update the alpha and track values.*/                                \
-        __m512 x_content =                                                     \
-            _mm512_loadu_ps(x + seq_offset + this->num_ + j_offset);           \
-        max_score = _mm512_add_ps(max_score, x_content);                       \
-        _mm512_storeu_ps(alpha + seq_offset + this->num_ + j_offset,           \
-                         max_score);                                           \
-        _mm512_storeu_si512(reinterpret_cast<__m512i*>(track + seq_offset +    \
-                                                       this->num_ + j_offset), \
-                            max_j);                                            \
-        /* Calculate the offset of next step*/                                 \
-        j_offset += ZMM_FLOAT_BLOCK;                                           \
-        if (j == this->end_ - 1) {                                             \
-          if (this->rest_ > 0) {                                               \
-            j_offset += last_offset;                                           \
-          } else {                                                             \
-            break;                                                             \
-          }                                                                    \
-        }                                                                      \
-      }                                                                        \
-      seq_offset += this->num_;                                                \
-    }                                                                          \
-  }
-
-#ifdef __AVX__
-INTRIAVX_FLOAT(kEQ8);
-INTRIAVX_FLOAT(kGT8LT16);
-INTRIAVX_FLOAT(kEQ16);
-INTRIAVX_FLOAT(kGT16);
-#endif
-#ifdef __AVX2__
-INTRIAVX2_FLOAT(platform::avx2, kEQ8);
-INTRIAVX2_FLOAT(platform::avx2, kGT8LT16);
-INTRIAVX2_FLOAT(platform::avx2, kEQ16);
-INTRIAVX2_FLOAT(platform::avx2, kGT16);
-#endif
-#ifdef __AVX512F__
-INTRIAVX2_FLOAT(platform::avx512f, kEQ8);
-INTRIAVX2_FLOAT(platform::avx512f, kGT8LT16);
-INTRIAVX512_FLOAT(kEQ16);
-INTRIAVX512_FLOAT(kGT16);
-#endif
-
-#undef INTRIAVX512_FLOAT
-#undef INTRIAVX2_FLOAT
-#undef INTRIAVX_FLOAT
-#undef INIT_ALPHA
-#undef UPDATE_ALPHA
-
-REGISTER_JITKERNEL_DEPRECATED(crf_decode, CRFDecodeKernel);
-
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc
deleted file mode 100644
index 7945cfb253a61b7d1191c39537254126e2bb85dd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ /dev/null
@@ -1,236 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/jit_kernel.h"
-#include <string>
-#include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#include "paddle/fluid/operators/math/jit_kernel_refer.h"
-
-#ifdef PADDLE_WITH_XBYAK
-#include "paddle/fluid/operators/math/jit_code.h"
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-
-#ifdef PADDLE_WITH_MKLML
-// try to use MKL to speedup
-template <typename T>
-void VExpMKL(const T* x, T* y, int n);
-
-template <>
-void VExpMKL<float>(const float* x, float* y, int n) {
-  platform::dynload::vsExp(n, x, y);
-}
-
-template <>
-void VExpMKL<double>(const double* x, double* y, int n) {
-  platform::dynload::vdExp(n, x, y);
-}
-
-template <typename T>
-void VSigmoidMKL(const T* x, T* y, int n) {
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  for (int i = 0; i < n; ++i) {
-    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = static_cast<T>(0) - y[i];
-  }
-  VExpMKL(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
-  }
-}
-
-template <typename T>
-void VTanhMKL(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * x[i];
-  }
-  VSigmoidMKL(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
-  }
-}
-#endif
-
-/* VExp JitKernel */
-template <typename T>
-class VExpKernelImpl : public VExpKernel<T> {
- public:
-  JITKERNEL_DECLARE_STATIC_FUNC;
-  explicit VExpKernelImpl(int d) : VExpKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(d)) {
-      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 70 * 8;
-      jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::exp,
-                                          sz > 4096 ? sz : 4096));
-      this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
-      return;
-    }
-#endif
-#ifdef PADDLE_WITH_MKLML
-    if (useMKL(d)) {
-      this->Compute = VExpMKL<T>;
-      return;
-    }
-#endif
-    this->Compute = refer::VExp<T>;
-  }
-
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool VExpKernelImpl<float>::useJIT(int d) {
-  return gen::VActJitCode::init(d, gen::operand_type::exp);
-}
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-template <>
-bool VExpKernelImpl<float>::useMKL(int d) {
-  return d > 512;
-}
-
-template <>
-bool VExpKernelImpl<double>::useMKL(int d) {
-  return true;
-}
-
-#endif
-
-/* VSigmoid JitKernel */
-template <typename T>
-class VSigmoidKernelImpl : public VSigmoidKernel<T> {
- public:
-  JITKERNEL_DECLARE_STATIC_FUNC;
-  explicit VSigmoidKernelImpl(int d) : VSigmoidKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(d)) {
-      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 82 * 8;
-      jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::sigmoid,
-                                          sz > 4096 ? sz : 4096));
-      this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
-      return;
-    }
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-    // strictly it's a better impl with MKL, then is refer
-    if (useMKL(d)) {
-      this->Compute = VSigmoidMKL<T>;
-      return;
-    }
-#endif
-    this->Compute = refer::VSigmoid<T>;
-  }
-
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool VSigmoidKernelImpl<float>::useJIT(int d) {
-  return gen::VActJitCode::init(d, gen::operand_type::sigmoid);
-}
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-template <>
-bool VSigmoidKernelImpl<float>::useMKL(int d) {
-  return d > 512;
-}
-
-template <>
-bool VSigmoidKernelImpl<double>::useMKL(int d) {
-  return true;
-}
-#endif
-
-/* VTanh JitKernel */
-template <typename T>
-class VTanhKernelImpl : public VTanhKernel<T> {
- public:
-  JITKERNEL_DECLARE_STATIC_FUNC;
-  explicit VTanhKernelImpl(int d) : VTanhKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(d)) {
-      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 84 * 8;
-      jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::tanh,
-                                          sz > 4096 ? sz : 4096));
-      this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
-      return;
-    }
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-    // strictly it's a better impl with MKL, then is refer
-    if (useMKL(d)) {
-      this->Compute = VTanhMKL<T>;
-      return;
-    }
-#endif
-    this->Compute = refer::VTanh<T>;
-  }
-
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool VTanhKernelImpl<float>::useJIT(int d) {
-  return gen::VActJitCode::init(d, gen::operand_type::tanh);
-}
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-template <>
-bool VTanhKernelImpl<float>::useMKL(int d) {
-  return d > 512;
-}
-
-template <>
-bool VTanhKernelImpl<double>::useMKL(int d) {
-  return true;
-}
-#endif
-
-REGISTER_JITKERNEL(vexp, VExpKernel);
-REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel);
-REGISTER_JITKERNEL(vtanh, VTanhKernel);
-
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h
deleted file mode 100644
index ba5f20e53383d3cafab4239f1a2d911addf1ae23..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_kernel_impl.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <type_traits>
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
-#define XMM_FLOAT_BLOCK 4
-#define YMM_FLOAT_BLOCK 8
-#define ZMM_FLOAT_BLOCK 16
-
-typedef struct {
-  void* gates;  // gates: W_ch, W_ih, W_fh, W_oh
-  const void* ct_1;
-  void* ct;
-  void* ht;
-  /* weight_peephole and checked data are only used in peephole*/
-  const void* wp{nullptr};
-  void* checked{nullptr};
-} lstm_t;
-
-typedef struct {
-  void* gates;  // gates: {W_update, W_reset; W_state}
-  const void* ht_1;
-  void* ht;
-} gru_t;
-
-struct rnn_attr_s {
-  int d;
-  std::string act_gate, act_cand;
-  rnn_attr_s() = default;
-  rnn_attr_s(int _d, const std::string& _act_gate, const std::string& _act_cand)
-      : d(_d), act_gate(_act_gate), act_cand(_act_cand) {}
-};
-
-struct lstm_attr_s : public rnn_attr_s {
-  bool use_peephole;
-  std::string act_cell;
-  lstm_attr_s() = default;
-  lstm_attr_s(int _d, const std::string& _act_gate,
-              const std::string& _act_cand, const std::string& _act_cell,
-              bool _use_peephole = false)
-      : rnn_attr_s(_d, _act_gate, _act_cand),
-        use_peephole(_use_peephole),
-        act_cell(_act_cell) {}
-};
-
-typedef struct rnn_attr_s gru_attr_t;
-typedef struct lstm_attr_s lstm_attr_t;
-
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
deleted file mode 100644
index e21092037a27d26cd31205b1b5d8e2f0cb8380cd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
+++ /dev/null
@@ -1,239 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/math/jit_kernel.h"
-#include <math.h>
-#include <limits>
-#include <string>
-#include "paddle/fluid/operators/math/jit_kernel_macro.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-
-/* Layer Norm JitKernel */
-template <typename T, platform::cpu_isa_t isa, jit_block>
-class LayerNormKernelImpl : public LayerNormKernel<T> {
- public:
-  explicit LayerNormKernelImpl(int right) : LayerNormKernel<T>() {
-    this->num_ = right;
-  }
-
-  void Compute(T* x, T* out, T* mean, T* var, const T* scale, const T* bias,
-               int height, const float epsilon) const override {
-    // get mean
-    for (int i = 0; i < height; i++) {
-      T sum = 0.0;
-      int offset = i * this->num_;
-      for (int j = 0; j < this->num_; j++) {
-        sum += x[offset + j];
-      }
-      mean[i] = sum / this->num_;
-    }
-
-    // get variance
-    for (int i = 0; i < height; i++) {
-      T sum = 0.0;
-      int offset = i * this->num_;
-      for (int j = 0; j < this->num_; j++) {
-        sum += (x[offset + j] - mean[i]) * (x[offset + j] - mean[i]);
-      }
-      var[i] = sum / this->num_;
-    }
-
-    for (int i = 0; i < height; i++) {
-      int offset = i * this->num_;
-      T sqrt_var = sqrt(var[i] + (T)epsilon);
-      for (int j = 0; j < this->num_; j++) {
-        out[offset + j] = (x[offset + j] - mean[i]) / sqrt_var;
-      }
-    }
-    if (scale) {
-      for (int i = 0; i < height; i++) {
-        int offset = i * this->num_;
-        for (int j = 0; j < this->num_; j++) {
-          out[offset + j] *= scale[j];
-        }
-      }
-    }
-
-    if (bias) {
-      for (int i = 0; i < height; i++) {
-        int offset = i * this->num_;
-        for (int j = 0; j < this->num_; j++) {
-          out[offset + j] += bias[j];
-        }
-      }
-    }
-  }
-};
-
-#define INTRIAVX_FLOAT(isa, jit_block)                                         \
-  template <>                                                                  \
-  LayerNormKernelImpl<float, isa, jit_block>::LayerNormKernelImpl(int right)   \
-      : LayerNormKernel<float>() {                                             \
-    this->num_ = right;                                                        \
-    this->rest_ = this->num_ % YMM_FLOAT_BLOCK;                                \
-    this->end_ = this->num_ - this->rest_;                                     \
-  }                                                                            \
-  template <>                                                                  \
-  void LayerNormKernelImpl<float, isa, jit_block>::Compute(                    \
-      float* x, float* out, float* mean, float* var, const float* scale,       \
-      const float* bias, int height, const float epsilon) const {              \
-    __m256 sum;                                                                \
-    __m256 mean_vec, var_vec;                                                  \
-    __m128 hi, lo;                                                             \
-    __m256 tmp;                                                                \
-    size_t offset;                                                             \
-    size_t j;                                                                  \
-    size_t block = YMM_FLOAT_BLOCK;                                            \
-    __m256 reverse_num_vec =                                                   \
-        _mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(this->num_));        \
-    __m256 epsilon_vec = _mm256_set1_ps(epsilon);                              \
-    int rest_mask =                                                            \
-        ((-1) & (~((~0U) >> (sizeof(int) * 8 - (YMM_FLOAT_BLOCK - rest_))))) & \
-        0x0ff;                                                                 \
-    __m256i mask_vec = _mm256_set_epi32(                                       \
-        rest_mask & 0x80 ? 0xffffffff : 0, rest_mask & 0x40 ? 0xffffffff : 0,  \
-        rest_mask & 0x20 ? 0xffffffff : 0, rest_mask & 0x10 ? 0xffffffff : 0,  \
-        rest_mask & 0x8 ? 0xffffffff : 0, rest_mask & 0x4 ? 0xffffffff : 0,    \
-        rest_mask & 0x2 ? 0xffffffff : 0, rest_mask & 0x1 ? 0xffffffff : 0);   \
-                                                                               \
-    for (int i = 0; i < height; ++i) {                                         \
-      offset = i * this->num_;                                                 \
-                                                                               \
-      /* get mean */                                                           \
-      sum = _mm256_setzero_ps();                                               \
-      for (j = offset; j < end_ + offset; j += block) {                        \
-        sum = _mm256_add_ps(sum, _mm256_loadu_ps((const float*)x + j));        \
-      }                                                                        \
-      if (rest_ != 0) {                                                        \
-        j = offset + this->num_ - block;                                       \
-        tmp = _mm256_loadu_ps((const float*)x + j);                            \
-        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
-        sum = _mm256_add_ps(sum, tmp);                                         \
-      }                                                                        \
-      hi = _mm256_extractf128_ps(sum, 1);                                      \
-      lo = _mm256_extractf128_ps(sum, 0);                                      \
-      sum = _mm256_add_ps(                                                     \
-          sum, _mm256_insertf128_ps(                                           \
-                   _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1));  \
-      sum = _mm256_hadd_ps(sum, sum);                                          \
-      sum = _mm256_hadd_ps(sum, sum);                                          \
-      mean_vec = _mm256_mul_ps(sum, reverse_num_vec);                          \
-      mean[i] = *reinterpret_cast<float*>(&mean_vec);                          \
-                                                                               \
-      /* get variance */                                                       \
-      sum = _mm256_setzero_ps();                                               \
-      for (j = offset; j < end_ + offset; j += block) {                        \
-        tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);   \
-        tmp = _mm256_mul_ps(tmp, tmp);                                         \
-        sum = _mm256_add_ps(sum, tmp);                                         \
-      }                                                                        \
-      if (rest_ != 0) {                                                        \
-        j = offset + this->num_ - block;                                       \
-        tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);   \
-        tmp = _mm256_mul_ps(tmp, tmp);                                         \
-        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
-        sum = _mm256_add_ps(sum, tmp);                                         \
-      }                                                                        \
-      hi = _mm256_extractf128_ps(sum, 1);                                      \
-      lo = _mm256_extractf128_ps(sum, 0);                                      \
-      sum = _mm256_add_ps(                                                     \
-          sum, _mm256_insertf128_ps(                                           \
-                   _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1));  \
-      sum = _mm256_hadd_ps(sum, sum);                                          \
-      sum = _mm256_hadd_ps(sum, sum);                                          \
-      var_vec = _mm256_mul_ps(sum, reverse_num_vec);                           \
-      var[i] = *reinterpret_cast<float*>(&var_vec);                            \
-                                                                               \
-      /* get x_norm and calculate output*/                                     \
-      for (j = offset; j < end_ + offset; j += block) {                        \
-        tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);   \
-        tmp = _mm256_div_ps(                                                   \
-            tmp, _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec)));         \
-        _mm256_storeu_ps(reinterpret_cast<float*>(out) + j, tmp);              \
-      }                                                                        \
-      if (rest_ != 0) {                                                        \
-        j = offset + num_ - block;                                             \
-        tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);   \
-        tmp = _mm256_div_ps(                                                   \
-            tmp, _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec)));         \
-        _mm256_storeu_ps(reinterpret_cast<float*>(out) + j, tmp);              \
-      }                                                                        \
-                                                                               \
-      if (scale) {                                                             \
-        if (rest_ != 0) {                                                      \
-          j = offset + this->num_ - block;                                     \
-          tmp = _mm256_loadu_ps((const float*)out + j);                        \
-        }                                                                      \
-        for (j = offset; j < end_ + offset; j += block) {                      \
-          _mm256_storeu_ps(                                                    \
-              reinterpret_cast<float*>(out) + j,                               \
-              _mm256_mul_ps(                                                   \
-                  _mm256_loadu_ps((const float*)out + j),                      \
-                  _mm256_loadu_ps((const float*)scale + j - offset)));         \
-        }                                                                      \
-        if (rest_ != 0) {                                                      \
-          j = offset + this->num_ - block;                                     \
-          _mm256_storeu_ps(                                                    \
-              reinterpret_cast<float*>(out) + j,                               \
-              _mm256_mul_ps(                                                   \
-                  tmp, _mm256_loadu_ps((const float*)scale + j - offset)));    \
-        }                                                                      \
-      }                                                                        \
-                                                                               \
-      if (bias) {                                                              \
-        if (rest_ != 0) {                                                      \
-          j = offset + this->num_ - block;                                     \
-          tmp = _mm256_loadu_ps((const float*)out + j);                        \
-        }                                                                      \
-        for (j = offset; j < end_ + offset; j += block) {                      \
-          _mm256_storeu_ps(                                                    \
-              reinterpret_cast<float*>(out) + j,                               \
-              _mm256_add_ps(                                                   \
-                  _mm256_loadu_ps((const float*)out + j),                      \
-                  _mm256_loadu_ps((const float*)bias + j - offset)));          \
-        }                                                                      \
-        if (rest_ != 0) {                                                      \
-          j = offset + this->num_ - block;                                     \
-          _mm256_storeu_ps(                                                    \
-              reinterpret_cast<float*>(out) + j,                               \
-              _mm256_add_ps(                                                   \
-                  tmp, _mm256_loadu_ps((const float*)bias + j - offset)));     \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-  }
-
-#ifdef __AVX__
-INTRIAVX_FLOAT(platform::avx, kEQ8);
-INTRIAVX_FLOAT(platform::avx, kGT8LT16);
-INTRIAVX_FLOAT(platform::avx, kEQ16);
-INTRIAVX_FLOAT(platform::avx, kGT16);
-INTRIAVX_FLOAT(platform::avx2, kEQ8);
-INTRIAVX_FLOAT(platform::avx2, kGT8LT16);
-INTRIAVX_FLOAT(platform::avx2, kEQ16);
-INTRIAVX_FLOAT(platform::avx2, kGT16);
-INTRIAVX_FLOAT(platform::avx512f, kEQ8);
-INTRIAVX_FLOAT(platform::avx512f, kGT8LT16);
-INTRIAVX_FLOAT(platform::avx512f, kEQ16);
-INTRIAVX_FLOAT(platform::avx512f, kGT16);
-#endif
-
-#undef INTRIAVX_FLOAT
-
-REGISTER_JITKERNEL_DEPRECATED(layer_norm, LayerNormKernel);
-
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h
deleted file mode 100644
index 4dba3b56810794cb4839d26386ae77a8f4507977..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_kernel_macro.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-
-#define JITKERNEL_DECLARE_STATIC_FUNC                       \
-  static inline std::string name(int d) {                   \
-    PADDLE_THROW("DType should be either float or double"); \
-  }                                                         \
-  static inline bool useJIT(int d) { return false; }        \
-  static inline bool useMKL(int d) { return false; }
-
-#define JITKERNEL_DEFINE_NAME(ker_key, ker_class)    \
-  template <>                                        \
-  std::string ker_class##Impl<float>::name(int d) {  \
-    std::string key(#ker_key "f");                   \
-    if (useJIT(d)) {                                 \
-      /* only jit code need record d*/               \
-      return key + "jit" + std::to_string(d);        \
-    } else if (useMKL(d)) {                          \
-      return key + "mkl";                            \
-    } else {                                         \
-      return key + "any";                            \
-    }                                                \
-  }                                                  \
-  template <>                                        \
-  std::string ker_class##Impl<double>::name(int d) { \
-    std::string key(#ker_key "d");                   \
-    /* jit code do not support double yet*/          \
-    if (useMKL(d)) {                                 \
-      return key + "mkl";                            \
-    } else {                                         \
-      return key + "any";                            \
-    }                                                \
-  }
-
-#define JITKERNEL_DECLARE(ker_class, ker_dtype) \
-  template <>                                   \
-  std::shared_ptr<const ker_class<ker_dtype>>   \
-  KernelPool::Get<ker_class<ker_dtype>, int>(int d)
-
-#define JITKERNEL_FIND_KEY(ker_class, ker_dtype) \
-  std::string key = ker_class##Impl<ker_dtype>::name(d)
-
-#define JITKERNEL_IMPL(ker_class, ker_dtype)           \
-  p = std::dynamic_pointer_cast<ker_class<ker_dtype>>( \
-      std::make_shared<ker_class##Impl<ker_dtype>>(d))
-
-#define REGISTER_JITKERNEL_WITH_DTYPE(ker_class, ker_dtype, marco_declare, \
-                                      macro_find_key, macro_impl)          \
-  marco_declare(ker_class, ker_dtype) {                                    \
-    macro_find_key(ker_class, ker_dtype);                                  \
-    if (kers_.find(key) == kers_.end()) {                                  \
-      std::shared_ptr<ker_class<ker_dtype>> p;                             \
-      macro_impl(ker_class, ker_dtype);                                    \
-      kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)});           \
-      return p;                                                            \
-    }                                                                      \
-    return std::dynamic_pointer_cast<const ker_class<ker_dtype>>(          \
-        kers_.at(key));                                                    \
-  }
-
-#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name,     \
-                                marco_declare, macro_find_key, macro_impl) \
-  marco_define_name(ker_key, ker_class);                                   \
-  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, marco_declare,           \
-                                macro_find_key, macro_impl);               \
-  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, marco_declare,          \
-                                macro_find_key, macro_impl)
-
-#define REGISTER_JITKERNEL(ker_key, ker_class)                       \
-  REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \
-                          JITKERNEL_DECLARE, JITKERNEL_FIND_KEY,     \
-                          JITKERNEL_IMPL)
-
-// TODO(TJ): below defines are deprecated, would be remove recently
-#define SEARCH_BLOCK(macro_, ker, dtype, isa)              \
-  if (d < YMM_FLOAT_BLOCK) {                               \
-    macro_(ker, dtype, isa, kLT8);                         \
-  } else if (d == YMM_FLOAT_BLOCK) {                       \
-    macro_(ker, dtype, isa, kEQ8);                         \
-  } else if (d > YMM_FLOAT_BLOCK && d < ZMM_FLOAT_BLOCK) { \
-    macro_(ker, dtype, isa, kGT8LT16);                     \
-  } else if (d == ZMM_FLOAT_BLOCK) {                       \
-    macro_(ker, dtype, isa, kEQ16);                        \
-  } else {                                                 \
-    macro_(ker, dtype, isa, kGT16);                        \
-  }
-
-#define SEARCH_ISA_BLOCK(macro_, ker, dtype)             \
-  if (platform::MayIUse(platform::avx512f)) {            \
-    SEARCH_BLOCK(macro_, ker, dtype, platform::avx512f); \
-  } else if (platform::MayIUse(platform::avx2)) {        \
-    SEARCH_BLOCK(macro_, ker, dtype, platform::avx2);    \
-  } else if (platform::MayIUse(platform::avx)) {         \
-    SEARCH_BLOCK(macro_, ker, dtype, platform::avx);     \
-  } else {                                               \
-    SEARCH_BLOCK(macro_, ker, dtype, platform::isa_any); \
-  }
-
-#define JITKERNEL_KEY(ker_key, dtype_key) \
-  #ker_key #dtype_key + std::to_string(d)
-
-#define JITKERNEL_NEW_IMPL_DEPRECATED(ker, dtype, isa, k) \
-  p = std::dynamic_pointer_cast<ker<dtype>>(              \
-      std::make_shared<ker##Impl<dtype, isa, k>>(d))
-
-#define JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, ker_dtype,       \
-                                        dtype_key, marco_declare, macro_key, \
-                                        macro_impl)                          \
-  marco_declare(ker_class, ker_dtype) {                                      \
-    std::string key = macro_key(ker_key, dtype_key);                         \
-    if (kers_.find(key) == kers_.end()) {                                    \
-      std::shared_ptr<ker_class<ker_dtype>> p;                               \
-      SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype);                    \
-      kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)});             \
-      return p;                                                              \
-    }                                                                        \
-    return std::dynamic_pointer_cast<const ker_class<ker_dtype>>(            \
-        kers_.at(key));                                                      \
-  }
-
-#define REGISTER_JITKERNEL_DEPRECATED(ker_key, ker_class)           \
-  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, float, f,     \
-                                  JITKERNEL_DECLARE, JITKERNEL_KEY, \
-                                  JITKERNEL_NEW_IMPL_DEPRECATED);   \
-  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, double, d,    \
-                                  JITKERNEL_DECLARE, JITKERNEL_KEY, \
-                                  JITKERNEL_NEW_IMPL_DEPRECATED)
-
-#define REGISTER_JITKERNEL_ARGS_DEPRECATED(ker_key, ker_class, marco_declare,  \
-                                           macro_key, macro_impl)              \
-  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, float, f, marco_declare, \
-                                  macro_key, macro_impl);                      \
-  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, double, d,               \
-                                  marco_declare, macro_key, macro_impl)
-
-#define FOR_EACH_ISA(macro_, block) \
-  macro_(platform::avx512f, block); \
-  macro_(platform::avx2, block);    \
-  macro_(platform::avx, block);     \
-  macro_(platform::isa_any, block)
-
-#define FOR_EACH_BLOCK(macro_, isa) \
-  macro_(isa, kLT8);                \
-  macro_(isa, kEQ8);                \
-  macro_(isa, kGT8LT16);            \
-  macro_(isa, kEQ16);               \
-  macro_(isa, kGT16)
-
-#define FOR_EACH_ISA_BLOCK(macro_)           \
-  FOR_EACH_BLOCK(macro_, platform::avx512f); \
-  FOR_EACH_BLOCK(macro_, platform::avx2);    \
-  FOR_EACH_BLOCK(macro_, platform::avx);     \
-  FOR_EACH_BLOCK(macro_, platform::isa_any)
-
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc
deleted file mode 100644
index 2db3274a45610aedea385baf650b8efb42ac39d0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ /dev/null
@@ -1,263 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/jit_kernel.h"
-#include <string>
-#include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#include "paddle/fluid/operators/math/jit_kernel_refer.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/macros.h"
-
-#ifdef PADDLE_WITH_XBYAK
-#include "paddle/fluid/operators/math/jit_code.h"
-#endif
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {
-
-/* LSTM JitKernel */
-template <typename T>
-class LSTMKernelImpl : public LSTMKernel<T> {
- public:
-  static inline std::string name(const lstm_attr_t& attr) {
-    PADDLE_THROW("DType should be either float or double");
-  }
-  static inline bool useJIT(int d) { return false; }
-  static inline bool useMKL(int d) { return false; }
-  explicit LSTMKernelImpl(const lstm_attr_t& attr) : LSTMKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(attr.d)) {
-      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8;
-      jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096));
-      this->ComputeCtHt =
-          jitcode0_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
-
-      jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096));
-      this->ComputeC1H1 =
-          jitcode1_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
-      return;
-    }
-#endif
-
-    this->ComputeCtHt = refer::LSTMCtHt<T>;
-    this->ComputeC1H1 = refer::LSTMC1H1<T>;
-  }
-
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::LSTMJitCode> jitcode0_{nullptr}, jitcode1_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool LSTMKernelImpl<float>::useJIT(int d) {
-  return gen::LSTMJitCode::init(d);
-}
-#endif
-
-/* Peephole JitKernel */
-template <typename T>
-class PeepholeKernelImpl : public LSTMKernel<T> {
- public:
-  static inline std::string name(const lstm_attr_t& attr) {
-    PADDLE_THROW("DType should be either float or double");
-  }
-  static inline bool useJIT(int d) { return false; }
-  static inline bool useMKL(int d) { return false; }
-  explicit PeepholeKernelImpl(const lstm_attr_t& attr) : LSTMKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(attr.d)) {
-      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 4 * 8;
-      jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096));
-      this->ComputeCtHt =
-          jitcode0_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
-
-      jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096));
-      this->ComputeC1H1 =
-          jitcode1_->getCode<void (*)(lstm_t*, const lstm_attr_t*)>();
-      return;
-    }
-#endif
-
-    this->ComputeCtHt = refer::LSTMCtHt<T>;
-    this->ComputeC1H1 = refer::LSTMC1H1<T>;
-  }
-
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::LSTMJitCode> jitcode0_{nullptr}, jitcode1_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool PeepholeKernelImpl<float>::useJIT(int d) {
-  return gen::LSTMJitCode::init(d);
-}
-#endif
-
-#define JITKERNEL_DEFINE_NAME_LSTM(ker_key, ker_class)                 \
-  template <>                                                          \
-  std::string ker_class##Impl<float>::name(const lstm_attr_t& attr) {  \
-    std::string key(#ker_key "f");                                     \
-    key += (attr.act_gate + attr.act_cand + attr.act_cell +            \
-            (attr.use_peephole ? "p" : "n"));                          \
-    if (useJIT(attr.d)) {                                              \
-      /* only jit code need record d*/                                 \
-      return key + "jit" + std::to_string(attr.d);                     \
-    } else if (useMKL(attr.d)) {                                       \
-      return key + "mkl";                                              \
-    } else {                                                           \
-      return key + "any";                                              \
-    }                                                                  \
-  }                                                                    \
-  template <>                                                          \
-  std::string ker_class##Impl<double>::name(const lstm_attr_t& attr) { \
-    std::string key(#ker_key "d");                                     \
-    /* jit code do not support double yet*/                            \
-    if (useMKL(attr.d)) {                                              \
-      return key + "mkl";                                              \
-    } else {                                                           \
-      return key + "any";                                              \
-    }                                                                  \
-  }
-
-#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype)          \
-  template <>                                                 \
-  std::shared_ptr<const LSTMKernel<ker_dtype>>                \
-  KernelPool::Get<LSTMKernel<ker_dtype>, const lstm_attr_t&>( \
-      const lstm_attr_t& attr)
-
-#define JITKERNEL_FIND_KEY_LSTM(ker_class, ker_dtype) \
-  std::string key = ker_class##Impl<ker_dtype>::name(attr)
-
-#define JITKERNEL_LSTM_IMPL(ker, dtype)                     \
-  if (attr.use_peephole) {                                  \
-    p = std::dynamic_pointer_cast<ker<dtype>>(              \
-        std::make_shared<PeepholeKernelImpl<dtype>>(attr)); \
-  } else {                                                  \
-    p = std::dynamic_pointer_cast<ker<dtype>>(              \
-        std::make_shared<ker##Impl<dtype>>(attr));          \
-  }
-
-REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DEFINE_NAME_LSTM,
-                        JITKERNEL_DECLARE_LSTM, JITKERNEL_FIND_KEY_LSTM,
-                        JITKERNEL_LSTM_IMPL);
-
-#undef JITKERNEL_LSTM_IMPL
-#undef JITKERNEL_FIND_KEY_LSTM
-#undef JITKERNEL_DECLARE_LSTM
-#undef JITKERNEL_DEFINE_NAME_LSTM
-
-/* GRU JitKernel */
-template <typename T>
-class GRUKernelImpl : public GRUKernel<T> {
- public:
-  static inline std::string name(const gru_attr_t& attr) {
-    PADDLE_THROW("DType should be either float or double");
-  }
-  static inline bool useJIT(int d) { return false; }
-  static inline bool useMKL(int d) { return false; }
-  explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel<T>() {
-#ifdef PADDLE_WITH_XBYAK
-    if (useJIT(attr.d)) {
-      size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8;
-      jitcode0_.reset(new gen::GRUJitCode(0, attr, sz > 4096 ? sz : 4096));
-      this->ComputeH1 =
-          jitcode0_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
-
-      jitcode1_.reset(new gen::GRUJitCode(1, attr, sz > 4096 ? sz : 4096));
-      this->ComputeHtPart1 =
-          jitcode1_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
-
-      jitcode2_.reset(new gen::GRUJitCode(2, attr, sz > 4096 ? sz : 4096));
-      this->ComputeHtPart2 =
-          jitcode2_->getCode<void (*)(gru_t*, const gru_attr_t*)>();
-      return;
-    }
-#endif
-    this->ComputeH1 = refer::GRUH1<T>;
-    this->ComputeHtPart1 = refer::GRUHtPart1<T>;
-    this->ComputeHtPart2 = refer::GRUHtPart2<T>;
-  }
-#ifdef PADDLE_WITH_XBYAK
-
- private:
-  std::unique_ptr<gen::GRUJitCode> jitcode0_{nullptr}, jitcode1_{nullptr},
-      jitcode2_{nullptr};
-#endif
-};
-
-#ifdef PADDLE_WITH_XBYAK
-template <>
-bool GRUKernelImpl<float>::useJIT(int d) {
-  return gen::GRUJitCode::init(d);
-}
-#endif
-
-#define JITKERNEL_DEFINE_NAME_GRU(ker_key, ker_class)                 \
-  template <>                                                         \
-  std::string ker_class##Impl<float>::name(const gru_attr_t& attr) {  \
-    std::string key(#ker_key "f");                                    \
-    key += (attr.act_gate + attr.act_cand);                           \
-    if (useJIT(attr.d)) {                                             \
-      /* only jit code need record d*/                                \
-      return key + "jit" + std::to_string(attr.d);                    \
-    } else if (useMKL(attr.d)) {                                      \
-      return key + "mkl";                                             \
-    } else {                                                          \
-      return key + "any";                                             \
-    }                                                                 \
-  }                                                                   \
-  template <>                                                         \
-  std::string ker_class##Impl<double>::name(const gru_attr_t& attr) { \
-    std::string key(#ker_key "d");                                    \
-    /* jit code do not support double yet*/                           \
-    if (useMKL(attr.d)) {                                             \
-      return key + "mkl";                                             \
-    } else {                                                          \
-      return key + "any";                                             \
-    }                                                                 \
-  }
-
-#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype)         \
-  template <>                                               \
-  std::shared_ptr<const ker_class<ker_dtype>>               \
-  KernelPool::Get<ker_class<ker_dtype>, const gru_attr_t&>( \
-      const gru_attr_t& attr)
-
-#define JITKERNEL_FIND_KEY_GRU(ker_class, ker_dtype) \
-  std::string key = ker_class##Impl<ker_dtype>::name(attr)
-
-#define JITKERNEL_GRU_IMPL(ker, dtype)       \
-  p = std::dynamic_pointer_cast<ker<dtype>>( \
-      std::make_shared<ker##Impl<dtype>>(attr));
-
-REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DEFINE_NAME_GRU,
-                        JITKERNEL_DECLARE_GRU, JITKERNEL_FIND_KEY_GRU,
-                        JITKERNEL_GRU_IMPL);
-
-#undef JITKERNEL_GRU_IMPL
-#undef JITKERNEL_FIND_KEY_GRU
-#undef JITKERNEL_DECLARE_GRU
-#undef JITKERNEL_DEFINE_NAME_GRU
-}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
deleted file mode 100644
index 19f7bd8909499c12fd5bee4db0d0a71a632e7f19..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ /dev/null
@@ -1,742 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/jit_kernel.h"
-#include <cmath>    // for exp
-#include <cstring>  // for memcpy
-#include <random>
-#include <string>
-#include <vector>
-#include "gflags/gflags.h"
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/operators/math/jit_kernel_refer.h"
-#include "paddle/fluid/platform/port.h"
-
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
-
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
-constexpr int repeat = 20000;
-
-// TODO(TJ): benchmark and test should be seperated,
-// benchmark should verify more sizes
-
-inline double GetCurrentUS() {
-  struct timeval time;
-  gettimeofday(&time, NULL);
-  return 1e+6 * time.tv_sec + time.tv_usec;
-}
-
-template <typename T>
-void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
-               const T upper = static_cast<T>(20.f)) {
-  static unsigned int seed = 100;
-  std::mt19937 rng(seed++);
-  std::uniform_real_distribution<double> uniform_dist(0, 1);
-  for (int i = 0; i < n; ++i) {
-    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
-  }
-}
-
-#if defined __AVX__ || defined __AVX2__
-void vrelu_intri8(const int n, const float* x, float* y) {
-  __m256 tmp = _mm256_loadu_ps(x);
-  tmp = _mm256_max_ps(tmp, _mm256_setzero_ps());
-  _mm256_storeu_ps(y, tmp);
-}
-#endif
-
-TEST(JitKernel, vrelu) {
-  namespace jit = paddle::operators::math::jitkernel;
-  namespace refer = paddle::operators::math::jitkernel::refer;
-  for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) {
-    std::vector<float> x(d);
-    std::vector<float> zref(d), ztgt(d);
-    RandomVec<float>(d, x.data(), -10.f, 1.f);
-    const auto& ker =
-        jit::KernelPool::Instance().template Get<jit::VReluKernel<float>>(d);
-    const float* x_data = x.data();
-    float* ztgt_data = ztgt.data();
-    float* zref_data = zref.data();
-    auto trefs = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      refer::VRelu<float>(x_data, zref_data, d);
-    }
-    auto trefe = GetCurrentUS();
-#if defined __AVX__ || defined __AVX2__
-    if (d == 8) {
-      auto si0 = GetCurrentUS();
-      for (int i = 0; i < repeat; ++i) {
-        vrelu_intri8(d, x_data, zref_data);
-      }
-      auto si1 = GetCurrentUS();
-      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat << " us";
-    }
-#endif
-    auto ttgts = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      ker->Compute(x_data, ztgt_data, d);
-    }
-    auto ttgte = GetCurrentUS();
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
-    for (int i = 0; i < d; ++i) {
-      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
-    }
-  }
-}
-
-TEST(JitKernel, vaddbias) {
-  namespace jit = paddle::operators::math::jitkernel;
-  namespace refer = paddle::operators::math::jitkernel::refer;
-  for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) {
-    std::vector<float> x(d);
-    std::vector<float> zref(d), ztgt(d);
-    RandomVec<float>(d, x.data(), -2.f, 2.f);
-    const auto& ker =
-        jit::KernelPool::Instance().template Get<jit::VAddBiasKernel<float>>(d);
-    const float a = 2.f;
-    const float* x_data = x.data();
-    float* ztgt_data = ztgt.data();
-    float* zref_data = zref.data();
-    auto trefs = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      refer::VAddBias<float>(&a, x_data, zref_data, d);
-    }
-    auto trefe = GetCurrentUS();
-    auto ttgts = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      ker->Compute(&a, x_data, ztgt_data, d);
-    }
-    auto ttgte = GetCurrentUS();
-
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
-    for (int i = 0; i < d; ++i) {
-      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
-    }
-  }
-}
-
-#ifdef PADDLE_WITH_MKLML
-void vexp_mkl(const int n, const float* x, float* y) {
-  paddle::platform::dynload::vsExp(n, x, y);
-}
-#endif
-
-TEST(JitKernel, vexp) {
-  namespace jit = paddle::operators::math::jitkernel;
-  namespace refer = paddle::operators::math::jitkernel::refer;
-  for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) {
-    std::vector<float> x(d);
-    std::vector<float> zref(d), ztgt(d);
-    RandomVec<float>(d, x.data(), -2.f, 2.f);
-    const auto& ker =
-        jit::KernelPool::Instance().template Get<jit::VExpKernel<float>>(d);
-    const float* x_data = x.data();
-    float* ztgt_data = ztgt.data();
-    float* zref_data = zref.data();
-    auto trefs = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      refer::VExp<float>(x_data, zref_data, d);
-    }
-    auto trefe = GetCurrentUS();
-
-#ifdef PADDLE_WITH_MKLML
-    auto tmkls = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      vexp_mkl(d, x_data, zref_data);
-    }
-    auto tmkle = GetCurrentUS();
-#endif
-
-    auto ttgts = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      // ker->Compute(x_data, ztgt_data);
-      ker->Compute(x_data, ztgt_data, d);
-    }
-    auto ttgte = GetCurrentUS();
-
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-#ifdef PADDLE_WITH_MKLML
-            << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
-#else
-            << " us, "
-#endif
-
-            << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
-    for (int i = 0; i < d; ++i) {
-      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
-    }
-  }
-}
-
-void vsigmoid_better(
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp,
-    const int n, const float* x, float* y) {
-  const float min = SIGMOID_THRESHOLD_MIN;
-  const float max = SIGMOID_THRESHOLD_MAX;
-  for (int i = 0; i < n; ++i) {
-    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = 0.f - y[i];
-  }
-  vexp->Compute(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = 1.f / (1.f + y[i]);
-  }
-}
-
-TEST(JitKernel, vsigmoid) {
-  namespace jit = paddle::operators::math::jitkernel;
-  namespace refer = paddle::operators::math::jitkernel::refer;
-  for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
-    std::vector<float> x(d);
-    std::vector<float> zref(d), ztgt(d);
-    RandomVec<float>(d, x.data(), -2.f, 2.f);
-    const auto& ker =
-        jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(d);
-    const auto& vexp =
-        jit::KernelPool::Instance().template Get<jit::VExpKernel<float>>(d);
-    const float* x_data = x.data();
-    float* ztgt_data = ztgt.data();
-    float* zref_data = zref.data();
-    auto tmkls = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      vsigmoid_better(vexp, d, x_data, zref_data);
-    }
-    auto tmkle = GetCurrentUS();
-    auto trefs = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      refer::VSigmoid<float>(x_data, zref_data, d);
-    }
-    auto trefe = GetCurrentUS();
-    auto ttgts = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      ker->Compute(x_data, ztgt_data, d);
-    }
-    auto ttgte = GetCurrentUS();
-
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
-            << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
-    for (int i = 0; i < d; ++i) {
-      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
-    }
-  }
-}
-
-void vtanh_better(
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VScalKernel<float>>& vscal,
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
-        vsigmoid,
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VAddBiasKernel<float>>&
-        vaddbias,
-    const int n, const float* x, float* y) {
-  const float a = 2.f, b = -1.f;
-  vscal->Compute(&a, x, y, n);
-  vsigmoid->Compute(y, y, n);
-  vscal->Compute(&a, y, y, n);
-  vaddbias->Compute(&b, y, y, n);
-}
-
-TEST(JitKernel, vtanh) {
-  namespace jit = paddle::operators::math::jitkernel;
-  namespace refer = paddle::operators::math::jitkernel::refer;
-  for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
-    std::vector<float> x(d);
-    std::vector<float> zref(d), ztgt(d);
-    RandomVec<float>(d, x.data(), -2.f, 2.f);
-    const auto& ker =
-        jit::KernelPool::Instance().template Get<jit::VTanhKernel<float>>(d);
-    const auto& vscal =
-        jit::KernelPool::Instance().template Get<jit::VScalKernel<float>>(d);
-    const auto& vsigmoid =
-        jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(d);
-    const auto& vaddbias =
-        jit::KernelPool::Instance().template Get<jit::VAddBiasKernel<float>>(d);
-    const float* x_data = x.data();
-    float* ztgt_data = ztgt.data();
-    float* zref_data = zref.data();
-    auto tmkls = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      vtanh_better(vscal, vsigmoid, vaddbias, d, x_data, zref_data);
-    }
-    auto tmkle = GetCurrentUS();
-    auto trefs = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      refer::VTanh<float>(x_data, zref_data, d);
-    }
-    auto trefe = GetCurrentUS();
-    auto ttgts = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      ker->Compute(x_data, ztgt_data, d);
-    }
-    auto ttgte = GetCurrentUS();
-
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
-            << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
-    for (int i = 0; i < d; ++i) {
-      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
-    }
-  }
-}
-
-void lstm_ctht_better(
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
-        vsigmoid_3d,
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VTanhKernel<float>>& vtanh_d,
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VMulKernel<float>>& vmul_d,
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd_d,
-    const int d, float* gates, const float* ct_1, float* ct, float* ht) {
-  int d2 = d * 2;
-  vsigmoid_3d->Compute(gates + d, gates + d, 3 * d);
-  vtanh_d->Compute(gates, gates, d);
-  vmul_d->Compute(gates, gates + d, gates + d, d);
-  vmul_d->Compute(ct_1, gates + d2, gates + d2, d);
-  vadd_d->Compute(gates + d, gates + d2, ct, d);
-  /* H_t = act_cell(C_t) * ogated */
-  vtanh_d->Compute(ct, gates + d2, d);
-  vmul_d->Compute(gates + d2, gates + d * 3, ht, d);
-}
-
-TEST(JitKernel, lstm) {
-  namespace jit = paddle::operators::math::jitkernel;
-  namespace refer = paddle::operators::math::jitkernel::refer;
-  for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) {
-    int d4 = d * 4;
-    int d3 = d * 3;
-    std::vector<float> x(d4), xref(d4);
-    std::vector<float> ct_1(d), ct_tgt(d), ht_tgt(d);
-    std::vector<float> ct_ref(d), ht_ref(d);
-    RandomVec<float>(d4, x.data(), -2.f, 2.f);
-    RandomVec<float>(d, ct_1.data(), -2.f, 2.f);
-    memcpy(xref.data(), x.data(), sizeof(float) * d4);
-    std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
-    const jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell, false);
-    const auto& ker =
-        jit::KernelPool::Instance()
-            .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(
-                attr);
-    // below kernels are used to compute refer
-    const auto& vsigmoid_3d =
-        jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(
-            d3);
-    const auto& vtanh_d =
-        jit::KernelPool::Instance().template Get<jit::VTanhKernel<float>>(d);
-    const auto& vmul_d =
-        jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(d);
-    const auto& vadd_d =
-        jit::KernelPool::Instance().template Get<jit::VAddKernel<float>>(d);
-
-    float* x_data = x.data();
-    float* xref_data = xref.data();
-    const float* ct_1_data = ct_1.data();
-    float* ct_tgt_data = ct_tgt.data();
-    float* ht_tgt_data = ht_tgt.data();
-    float* ct_ref_data = ct_ref.data();
-    float* ht_ref_data = ht_ref.data();
-    // compute once to check correctness
-    jit::lstm_t step;
-    step.gates = xref_data;
-    step.ct_1 = ct_1_data;
-    step.ct = ct_ref_data;
-    step.ht = ht_ref_data;
-    refer::LSTMCtHt<float>(&step, &attr);
-
-    step.gates = x_data;
-    step.ct = ct_tgt_data;
-    step.ht = ht_tgt_data;
-    ker->ComputeCtHt(&step, &attr);
-    for (int i = 0; i < d; ++i) {
-      EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3);
-      EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3);
-    }
-
-    auto tmkls = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      lstm_ctht_better(vsigmoid_3d, vtanh_d, vmul_d, vadd_d, d, xref_data,
-                       ct_1_data, ct_ref_data, ht_ref_data);
-    }
-    auto tmkle = GetCurrentUS();
-    auto trefs = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      refer::LSTMCtHt<float>(&step, &attr);
-    }
-    auto trefe = GetCurrentUS();
-    auto ttgts = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      ker->ComputeCtHt(&step, &attr);
-    }
-    auto ttgte = GetCurrentUS();
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, better(jit) takes: " << (tmkle - tmkls) / repeat
-            << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us";
-  }
-}
-
-#if defined __AVX__ || defined __AVX2__
-void vscal_intri8(const int n, const float a, const float* x, float* y) {
-  __m256 tmp;
-  __m256 scalar = _mm256_set1_ps(a);
-  tmp = _mm256_loadu_ps(x);
-  tmp = _mm256_mul_ps(tmp, scalar);
-  _mm256_storeu_ps(y, tmp);
-}
-void vscal_inp_intri8(const int n, const float a, float* x) {
-  __m256 tmp;
-  __m256 scalar = _mm256_set1_ps(a);
-  tmp = _mm256_loadu_ps(x);
-  tmp = _mm256_mul_ps(tmp, scalar);
-  _mm256_storeu_ps(x, tmp);
-}
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-void vscal_inp_mkl(const int n, const float a, float* x) {
-  paddle::platform::dynload::cblas_sscal(n, a, x, 1);
-}
-#endif
-
-TEST(JitKernel, vscal) {
-  namespace jit = paddle::operators::math::jitkernel;
-  namespace refer = paddle::operators::math::jitkernel::refer;
-  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
-    std::vector<float> x(d), y(d);
-    std::vector<float> zref(d), ztgt(d);
-    RandomVec<float>(d, x.data());
-    std::memcpy(y.data(), x.data(), sizeof(float) * d);
-    float a = 2.f;
-    const auto& ker =
-        jit::KernelPool::Instance().template Get<jit::VScalKernel<float>>(d);
-    const float* x_data = x.data();
-    float* y_data = y.data();
-    float* ztgt_data = ztgt.data();
-    float* zref_data = zref.data();
-    auto trefs = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      refer::VScal<float>(&a, x_data, zref_data, d);
-    }
-    auto trefe = GetCurrentUS();
-    auto trefs1 = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      refer::VScal<float>(&a, y_data, y_data, d);
-    }
-    auto trefe1 = GetCurrentUS();
-
-#ifdef PADDLE_WITH_MKLML
-    auto tmkls = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      vscal_inp_mkl(d, a, y_data);
-    }
-    auto tmkle = GetCurrentUS();
-#endif
-
-#if defined __AVX__ || defined __AVX2__
-    if (d == 8) {
-      auto si0 = GetCurrentUS();
-      for (int i = 0; i < repeat; ++i) {
-        vscal_intri8(d, a, x_data, zref_data);
-      }
-      auto si1 = GetCurrentUS();
-      auto si2 = GetCurrentUS();
-      for (int i = 0; i < repeat; ++i) {
-        vscal_inp_intri8(d, a, y_data);
-      }
-      auto si3 = GetCurrentUS();
-      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat
-              << " us, inplace: " << (si3 - si2) / repeat << " us";
-    }
-#endif
-
-    auto ttgts = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      ker->Compute(&a, x_data, ztgt_data, d);
-    }
-    auto ttgte = GetCurrentUS();
-    auto ttgts1 = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      ker->Compute(&a, y_data, y_data, d);
-    }
-    auto ttgte1 = GetCurrentUS();
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, inplace takes: " << (trefe1 - trefs1) / repeat
-#ifdef PADDLE_WITH_MKLML
-            << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, "
-#else
-            << " us, "
-#endif
-            << "tgt takes: " << (ttgte - ttgts) / repeat
-            << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat << " us";
-    for (int i = 0; i < d; ++i) {
-      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
-    }
-  }
-}
-
-#if defined __AVX__ || defined __AVX2__
-void vmul_intri8(const int n, const float* x, const float* y, float* z) {
-  __m256 tmpx, tmpy;
-  tmpx = _mm256_loadu_ps(x);
-  tmpy = _mm256_loadu_ps(y);
-  tmpx = _mm256_mul_ps(tmpx, tmpy);
-  _mm256_storeu_ps(z, tmpx);
-}
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-void vmul_mkl(const int n, const float* x, const float* y, float* z) {
-  paddle::platform::dynload::vsMul(n, x, y, z);
-}
-#endif
-
-TEST(JitKernel, vmul) {
-  namespace jit = paddle::operators::math::jitkernel;
-  namespace refer = paddle::operators::math::jitkernel::refer;
-  for (int d : {7, 8, 15, 16, 20, 30, 256, 512, 1000, 1024}) {
-    std::vector<float> x(d), y(d);
-    std::vector<float> zref(d), ztgt(d);
-    RandomVec<float>(d, x.data());
-    RandomVec<float>(d, y.data());
-    const auto& ker =
-        jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(d);
-    const float* x_data = x.data();
-    const float* y_data = y.data();
-    float* ztgt_data = ztgt.data();
-    float* zref_data = zref.data();
-    auto trefs = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      refer::VMul<float>(x_data, y_data, zref_data, d);
-    }
-    auto trefe = GetCurrentUS();
-
-#ifdef PADDLE_WITH_MKLML
-    auto tmkls = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      vmul_mkl(d, x_data, y_data, zref_data);
-    }
-    auto tmkle = GetCurrentUS();
-#endif
-
-#if defined __AVX__ || defined __AVX2__
-    if (d == 8) {
-      auto si0 = GetCurrentUS();
-      for (int i = 0; i < repeat; ++i) {
-        vmul_intri8(d, x_data, y_data, zref_data);
-      }
-      auto si1 = GetCurrentUS();
-      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
-    }
-#endif
-
-    auto ttgts = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      ker->Compute(x_data, y_data, ztgt_data, d);
-    }
-    auto ttgte = GetCurrentUS();
-
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-#ifdef PADDLE_WITH_MKLML
-            << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
-#else
-            << " us, "
-#endif
-            << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
-    for (int i = 0; i < d; ++i) {
-      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
-    }
-  }
-}
-
-#if defined __AVX__ || defined __AVX2__
-void vadd_intri8(const int n, const float* x, const float* y, float* z) {
-  __m256 tmpx, tmpy;
-  tmpx = _mm256_loadu_ps(x);
-  tmpy = _mm256_loadu_ps(y);
-  tmpx = _mm256_add_ps(tmpx, tmpy);
-  _mm256_storeu_ps(z, tmpx);
-}
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-void vadd_mkl(const int n, const float* x, const float* y, float* z) {
-  paddle::platform::dynload::vsAdd(n, x, y, z);
-}
-#endif
-
-TEST(JitKernel, vadd) {
-  namespace jit = paddle::operators::math::jitkernel;
-  namespace refer = paddle::operators::math::jitkernel::refer;
-  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
-    std::vector<float> x(d), y(d);
-    std::vector<float> zref(d), ztgt(d);
-    RandomVec<float>(d, x.data());
-    RandomVec<float>(d, y.data());
-    const auto& ker =
-        jit::KernelPool::Instance().template Get<jit::VAddKernel<float>>(d);
-    const float* x_data = x.data();
-    const float* y_data = y.data();
-    float* ztgt_data = ztgt.data();
-    float* zref_data = zref.data();
-    auto trefs = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      refer::VAdd<float>(x_data, y_data, zref_data, d);
-    }
-    auto trefe = GetCurrentUS();
-
-#ifdef PADDLE_WITH_MKLML
-    auto tmkls = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      vadd_mkl(d, x_data, y_data, zref_data);
-    }
-    auto tmkle = GetCurrentUS();
-#endif
-
-#if defined __AVX__ || defined __AVX2__
-    if (d == 8) {
-      auto si0 = GetCurrentUS();
-      for (int i = 0; i < repeat; ++i) {
-        vadd_intri8(d, x_data, y_data, zref_data);
-      }
-      auto si1 = GetCurrentUS();
-      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
-    }
-#endif
-
-    auto ttgts = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      ker->Compute(x_data, y_data, ztgt_data, d);
-    }
-    auto ttgte = GetCurrentUS();
-
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-#ifdef PADDLE_WITH_MKLML
-            << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
-#else
-            << " us, "
-#endif
-            << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
-    for (int i = 0; i < d; ++i) {
-      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
-    }
-  }
-}
-
-void vaddrelu_better(
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd,
-    const std::shared_ptr<
-        const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu,
-    const float* x, const float* y, float* z, int d) {
-  vadd->Compute(x, y, z, d);
-  vrelu->Compute(z, z, d);
-}
-
-TEST(JitKernel, vaddrelu) {
-  namespace jit = paddle::operators::math::jitkernel;
-  namespace refer = paddle::operators::math::jitkernel::refer;
-  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
-    std::vector<float> x(d), y(d);
-    std::vector<float> zref(d), ztgt(d);
-    RandomVec<float>(d, x.data());
-    RandomVec<float>(d, y.data());
-    const auto& ker =
-        jit::KernelPool::Instance().template Get<jit::VAddReluKernel<float>>(d);
-    const auto& vadd =
-        jit::KernelPool::Instance().template Get<jit::VAddKernel<float>>(d);
-    const auto& vrelu =
-        jit::KernelPool::Instance().template Get<jit::VReluKernel<float>>(d);
-    const float* x_data = x.data();
-    const float* y_data = y.data();
-    float* ztgt_data = ztgt.data();
-    float* zref_data = zref.data();
-    auto trefs = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      refer::VAddRelu<float>(x_data, y_data, zref_data, d);
-    }
-    auto trefe = GetCurrentUS();
-    auto tmkls = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data, d);
-    }
-    auto tmkle = GetCurrentUS();
-    auto ttgts = GetCurrentUS();
-    for (int i = 0; i < repeat; ++i) {
-      ker->Compute(x_data, y_data, ztgt_data, d);
-    }
-    auto ttgte = GetCurrentUS();
-    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-            << " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
-            << "tgt takes: " << (ttgte - ttgts) / repeat << " us";
-    for (int i = 0; i < d; ++i) {
-      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
-    }
-  }
-}
-
-TEST(JitKernel, pool) {
-  namespace jit = paddle::operators::math::jitkernel;
-  const int frame_size = 4;
-  std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
-  jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false);
-
-  // empty call it to avoid unknown flag 'use_pinned_memory' on Mac
-  paddle::platform::MayIUse(paddle::platform::avx);
-  const auto& plstm1 =
-      jit::KernelPool::Instance()
-          .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr);
-
-  const auto& plstm2 =
-      jit::KernelPool::Instance()
-          .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr);
-  EXPECT_EQ(plstm1, plstm2);
-
-  const auto& peephole =
-      jit::KernelPool::Instance()
-          .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(
-              jit::lstm_attr_t(frame_size, act_gate, act_cand, act_cell, true));
-  EXPECT_TRUE(plstm1 != peephole);
-
-  const auto& pvmul_f =
-      jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(4);
-  EXPECT_TRUE(std::dynamic_pointer_cast<const jit::Kernel>(plstm2) !=
-              std::dynamic_pointer_cast<const jit::Kernel>(pvmul_f));
-
-  const auto& pvmul_d =
-      jit::KernelPool::Instance().template Get<jit::VMulKernel<double>>(4);
-  EXPECT_TRUE(std::dynamic_pointer_cast<const jit::Kernel>(pvmul_f) !=
-              std::dynamic_pointer_cast<const jit::Kernel>(pvmul_d));
-
-  const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulfjit4");
-#if defined(__APPLE__) || defined(__OSX__) || defined(_WIN32)
-  EXPECT_EQ(pvmul_from_key, nullptr);
-#else
-  EXPECT_EQ(pvmul_from_key, pvmul_f);
-#endif
-  const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulfjit");
-  EXPECT_TRUE(pvmul_from_key2 == nullptr);
-}
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 3eba268cfa9712e4bc5475dd44076bc768552bce..1a11b584e2bab7eeb395bf391da080ec0ba62ae4 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
 #include <set>
 #include <unordered_map>
 
@@ -252,23 +253,26 @@ elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
 template <typename T>
 struct MergeAdd<platform::CPUDeviceContext, T> {
   framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
-                                     const framework::SelectedRows& input) {
+                                     const framework::SelectedRows& input,
+                                     const bool sorted_result = false) {
     framework::SelectedRows out;
-    (*this)(context, input, &out);
+    (*this)(context, input, &out, sorted_result);
     return out;
   }
 
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::SelectedRows& input,
-                  framework::SelectedRows* output) {
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
     std::vector<const framework::SelectedRows*> inputs;
     inputs.push_back(&input);
-    (*this)(context, inputs, output);
+    (*this)(context, inputs, output, sorted_result);
   }
 
   void operator()(const platform::CPUDeviceContext& context,
                   const std::vector<const framework::SelectedRows*>& inputs,
-                  framework::SelectedRows* output) {
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
     if (inputs.size() == 0) {
       VLOG(3) << "no input! return";
       return;
@@ -301,6 +305,9 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
     }
     std::vector<int64_t> merge_rows(merged_row_set.begin(),
                                     merged_row_set.end());
+    if (sorted_result) {
+      std::sort(merge_rows.begin(), merge_rows.end());
+    }
     std::unordered_map<int64_t, size_t> rows_to_id;
     for (size_t i = 0; i < merge_rows.size(); ++i) {
       rows_to_id[merge_rows[i]] = i;
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index c4fccdbf862fda8a599869c30ae598573ca367aa..0d63f641c8670f8629c52b9e5fc380a250d80dd7 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -266,7 +266,8 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
 template <typename T>
 struct MergeAdd<platform::CUDADeviceContext, T> {
   framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
-                                     const framework::SelectedRows& input) {
+                                     const framework::SelectedRows& input,
+                                     const bool sorted_result = false) {
     framework::SelectedRows out;
     (*this)(context, input, &out);
     return out;
@@ -274,7 +275,8 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
 
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::SelectedRows& input,
-                  framework::SelectedRows* output) {
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
     framework::Vector<int64_t> input_rows(input.rows());
     if (input_rows.size() == 0) {
       return;
@@ -312,7 +314,8 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
 
   void operator()(const platform::CUDADeviceContext& context,
                   const std::vector<const framework::SelectedRows*>& inputs,
-                  framework::SelectedRows* output) {
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false) {
     if (inputs.size() == 0) {
       VLOG(3) << "no input! return";
       return;
diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
index 6d146d39d6d07678e859b82b25ba60ed7661546d..222d761ef91d8aee4843d717dabba7edf131f8dc 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -81,13 +81,16 @@ struct MergeAdd {
   // unary functor, merge by adding duplicated rows in
   // the input SelectedRows object.
   framework::SelectedRows operator()(const DeviceContext& context,
-                                     const framework::SelectedRows& input);
+                                     const framework::SelectedRows& input,
+                                     const bool sorted_result = false);
   void operator()(const DeviceContext& context,
                   const framework::SelectedRows& input,
-                  framework::SelectedRows* output);
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false);
   void operator()(const DeviceContext& context,
                   const std::vector<const framework::SelectedRows*>& inputs,
-                  framework::SelectedRows* output);
+                  framework::SelectedRows* output,
+                  const bool sorted_result = false);
 };
 
 enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu
index 83bb4dde46fa241affad3788e3381b6ecd8aa098..08088eb8733f28f0dc8ecade2aa4b70342244b0a 100644
--- a/paddle/fluid/operators/mean_iou_op.cu
+++ b/paddle/fluid/operators/mean_iou_op.cu
@@ -92,8 +92,8 @@ template <typename T>
 class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
-                       .eigen_device();
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto& place = *dev_ctx.eigen_device();
     // get input and output tensor
     auto* predictions = ctx.Input<Tensor>("Predictions");
     auto* labels = ctx.Input<Tensor>("Labels");
@@ -115,11 +115,11 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
     auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
     auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
 
-    // Temporary tensor
-    Tensor ious;
-    float* ious_data = ious.mutable_data<float>(
-        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
-    auto ious_t = EigenTensor<float, 1>::From(ious);
+    // Temporary memory
+    auto& allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    auto tmp_ious_data = allocator.Allocate(num_classes * sizeof(float));
+    float* ious_data = static_cast<float*>(tmp_ious_data->ptr());
 
     // Init out_wrong, out_correct and out_mean_iou
     out_wrong_t.device(place) = out_wrong_t.constant(0);
@@ -148,7 +148,7 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
     CountCUDAKernel<T><<<grid, block, cache_size, stream>>>(
         num_classes, predictions->numel(), predictions_data, labels_data,
         out_wrong_data, out_correct_data);
-    ctx.device_context().Wait();
+
     ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data,
                                                   out_correct_data, ious_data,
                                                   out_mean_iou_data);
diff --git a/paddle/fluid/operators/merge_selected_rows_op.cc b/paddle/fluid/operators/merge_selected_rows_op.cc
index 3c15c839554599104d21a5225c078d41735c4a60..50f44c7fc5ec90420d7c38f0f536ff7adb8f9ec4 100644
--- a/paddle/fluid/operators/merge_selected_rows_op.cc
+++ b/paddle/fluid/operators/merge_selected_rows_op.cc
@@ -26,6 +26,13 @@ class MergeSelectedRowsOp : public framework::OperatorWithKernel {
                    "Input(X) of MergeSelectedRowsOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of MergeSelectedRowsOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X").front(),
+                      framework::proto::VarType::SELECTED_ROWS,
+                      "Input X only should be SelectedRows.");
+    PADDLE_ENFORCE_EQ(ctx->GetOutputsVarType("Out").front(),
+                      framework::proto::VarType::SELECTED_ROWS,
+                      "Output Y only should be SelectedRows.");
+
     ctx->ShareDim("X", /*->*/ "Out");
   }
 };
@@ -43,7 +50,28 @@ class MergeSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
         R"DOC(
 MergeSelectedRows Operator.
 
-MergeSelectedRows is used to merge the duplicated rows of the input.
+MergeSelectedRows is used to merge the duplicated rows of the input. The
+output's row has no duplicated, and it's order is incremental.
+
+Example:
+  Input:
+    X.rows is [0, 5, 5, 4, 19]
+    X.height is 20
+    X.value is:
+        [[1, 1]
+         [2, 2]
+         [3, 3]
+         [4, 4]
+         [6, 6]]
+
+   Output:
+    Out.row is [0, 4, 5, 19]
+    Out.height is 20
+    Out.value is:
+        [[1, 1]
+         [4, 4]
+         [5, 5]
+         [6, 6]]
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h
index 0ed77ff5577cf4f45a8865db9b42e8bda9839478..8e7457dd56c2413f84008ce467537e07b3e80cc7 100644
--- a/paddle/fluid/operators/ngraph/ngraph_ops.h
+++ b/paddle/fluid/operators/ngraph/ngraph_ops.h
@@ -22,4 +22,6 @@ limitations under the License. */
 #pragma once
 
 #include "ops/binary_unnary_op.h"
+#include "ops/fill_constant_op.h"
 #include "ops/mul_op.h"
+#include "ops/top_k_op.h"
diff --git a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
index 4e2f5e231c16cd0fad6db287aa19430c56b534fd..6610380fcf432d0019f7e844fa9304e151b20efd 100644
--- a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
+++ b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
@@ -45,7 +45,6 @@ static void BuildUnaryNode(
   auto out = std::make_shared<T>(input);
   paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
 }
-
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..5eff69e7b165fa19c775926914b7b3e8fcb043e5
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
@@ -0,0 +1,61 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#pragma once
+
+#include <string>
+#include <vector>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildFillConstantNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto vsp = op_attrs.Get<std::vector<int64_t>>("shape");
+  ngraph::Shape shape;
+  for (auto& sp : vsp) {
+    shape.push_back(sp);
+  }
+  float value = op_attrs.Get<float>("value");
+  ngraph::element::Type ng_dtype;
+  auto data_type = static_cast<paddle::framework::proto::VarType::Type>(
+      op_attrs.Get<int>("dtype"));
+  if (data_type == paddle::framework::proto::VarType::FP32) {
+    ng_dtype = ngraph::element::f32;
+  } else if (data_type == paddle::framework::proto::VarType::FP64) {
+    ng_dtype = ngraph::element::f64;
+  } else if (data_type == paddle::framework::proto::VarType::INT64) {
+    ng_dtype = ngraph::element::i64;
+  } else if (data_type == paddle::framework::proto::VarType::INT32) {
+    ng_dtype = ngraph::element::i32;
+  } else if (data_type == paddle::framework::proto::VarType::BOOL) {
+    ng_dtype = ngraph::element::boolean;
+  } else {
+    PADDLE_THROW("unsupported data type: %s", data_type);
+  }
+  auto out = ngraph::op::Constant::create(ng_dtype, shape, {value});
+  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b7254497c0e1aab2e653e69e6461f262b929703
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h
@@ -0,0 +1,51 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#pragma once
+
+#include <string>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildTopKNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  int k = op_attrs.Get<int>("k");
+  auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto top_k = std::make_shared<ngraph::op::TopK>(
+      input, input->get_shape().size() - 1, ngraph::element::i64, k);
+  std::shared_ptr<ngraph::Node> indices =
+      std::make_shared<ngraph::op::GetOutputElement>(top_k, 0);
+  std::shared_ptr<ngraph::Node> out =
+      std::make_shared<ngraph::op::GetOutputElement>(top_k, 1);
+  auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
+  if (dummy_out && dummy_out->get_element_type() != out->get_element_type()) {
+    out = std::make_shared<ngraph::op::Convert>(out,
+                                                dummy_out->get_element_type());
+  }
+  paddle::platform::SetOutputNode(op, "Indices", indices, ngb_node_map);
+  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 43d3ea4befd4ef79134e84f029b6fcb36e159c86..5c559484ec95e794ebbbe0e713cb9e26b5c01b98 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -157,8 +157,11 @@ struct AdamFunctor<T, CPUAdam> {
   }
 };
 
+template <typename T, typename Flavour>
+struct SparseAdamFunctor;
+
 template <typename T>
-struct SparseAdamFunctor {
+struct SparseAdamFunctor<T, GPUAdam> {
   T beta1_;
   T beta2_;
   T epsilon_;
@@ -236,6 +239,106 @@ struct SparseAdamFunctor {
   }
 };
 
+template <typename T>
+struct SparseAdamFunctor<T, CPUAdam> {
+  T beta1_;
+  T beta2_;
+  T epsilon_;
+
+  const T* beta1_pow_;
+  const T* beta2_pow_;
+  const T* moment1_;
+  T* moment1_out_;
+  const T* moment2_;
+  T* moment2_out_;
+  const T* lr_;
+  const T* grad_;
+  const T* param_;
+  T* param_out_;
+
+  const int64_t* rows_;
+  int64_t row_numel_;
+  int64_t row_count_;
+
+  SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
+                    const T* beta2_pow, const T* mom1, T* mom1_out,
+                    const T* mom2, T* mom2_out, const T* lr, const T* grad,
+                    const T* param, T* param_out, const int64_t* rows,
+                    int64_t row_numel, int64_t row_count, bool lazy_mode)
+      : beta1_(beta1),
+        beta2_(beta2),
+        epsilon_(epsilon),
+        beta1_pow_(beta1_pow),
+        beta2_pow_(beta2_pow),
+        moment1_(mom1),
+        moment1_out_(mom1_out),
+        moment2_(mom2),
+        moment2_out_(mom2_out),
+        lr_(lr),
+        grad_(grad),
+        param_(param),
+        param_out_(param_out),
+        rows_(rows),
+        row_numel_(row_numel),
+        row_count_(row_count) {}
+
+  inline HOSTDEVICE void adam_update(size_t i, T g) const {
+    // The following code is the same as dense
+    T mom1 = moment1_[i];
+    T mom2 = moment2_[i];
+    T lr = *lr_;
+    T beta1_pow = *beta1_pow_;
+    T beta2_pow = *beta2_pow_;
+    T p = param_[i];
+
+    // Calculation
+    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+
+    mom1 = beta1_ * mom1 + (1 - beta1_) * g;
+    mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+
+    // Write back to global memory
+    moment1_out_[i] = mom1;
+    moment2_out_[i] = mom2;
+    param_out_[i] = p;
+  }
+
+  inline void operator()(size_t numel) const {
+    // lr could be reuse
+    T lr = *lr_;
+    T beta1_pow = *beta1_pow_;
+    T beta2_pow = *beta2_pow_;
+    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+    size_t row_count = numel / row_numel_;
+
+    for (size_t i = 0U, j = 0U; i != row_count; ++i) {
+      if (i == *(rows_ + j)) {
+        for (size_t k = 0U; k != row_numel_; ++k) {
+          T g = grad_[j * row_numel_ + k];
+          adam_update(i * row_numel_ + k, g);
+        }
+        ++j;
+      } else {
+        for (size_t k = 0U; k != row_numel_; ++k) {
+          T mom1 = moment1_[i * row_numel_ + k];
+          T mom2 = moment2_[i * row_numel_ + k];
+          T p = param_[i * row_numel_ + k];
+
+          mom1 = beta1_ * mom1;
+          mom2 = beta2_ * mom2;
+
+          p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+          // Write back to global memory
+          moment1_out_[i * row_numel_ + k] = mom1;
+          moment2_out_[i * row_numel_ + k] = mom2;
+          param_out_[i * row_numel_ + k] = p;
+        }
+      }
+    }
+  }
+};
+
 template <typename DeviceContext, typename T>
 class AdamOpKernel : public framework::OpKernel<T> {
  public:
@@ -332,7 +435,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
                                    .Var()
                                    ->GetMutable<framework::SelectedRows>();
         merge_func(ctx.template device_context<DeviceContext>(), grad,
-                   grad_merge_var);
+                   grad_merge_var, true);
         grad_merge_ptr = grad_merge_var;
       }
 
@@ -348,32 +451,46 @@ class AdamOpKernel : public framework::OpKernel<T> {
       } else {
 #endif
         rows = grad_merge.rows().data();
-
 #if defined(PADDLE_WITH_CUDA)
       }
 #endif
       auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
 
-      SparseAdamFunctor<T> functor(
-          beta1, beta2, epsilon, beta1_pow.template data<T>(),
-          beta2_pow.template data<T>(), mom1.template data<T>(),
-          mom1_out.template mutable_data<T>(ctx.GetPlace()),
-          mom2.template data<T>(),
-          mom2_out.template mutable_data<T>(ctx.GetPlace()),
-          lr.template data<T>(), grad_data, param.template data<T>(),
-          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
-          grad_merge.rows().size(), lazy_mode);
-      VLOG(3) << "lazy_mode :" << lazy_mode;
-      if (lazy_mode && platform::is_cpu_place(ctx.GetPlace())) {
-        size_t row_count = grad_merge.rows().size();
-        std::vector<int64_t> cpu_rows(grad_merge.rows());
-        for (size_t row_index = 0; row_index < row_count; ++row_index) {
-          for (size_t offset = 0; offset < row_numel; ++offset) {
-            size_t i = cpu_rows[row_index] * row_numel + offset;
-            functor.adam_update(i, grad_data[row_index * row_numel + offset]);
+      if (platform::is_cpu_place(ctx.GetPlace())) {
+        SparseAdamFunctor<T, CPUAdam> functor(
+            beta1, beta2, epsilon, beta1_pow.template data<T>(),
+            beta2_pow.template data<T>(), mom1.template data<T>(),
+            mom1_out.template mutable_data<T>(ctx.GetPlace()),
+            mom2.template data<T>(),
+            mom2_out.template mutable_data<T>(ctx.GetPlace()),
+            lr.template data<T>(), grad_data, param.template data<T>(),
+            param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
+            grad_merge.rows().size(), lazy_mode);
+
+        if (lazy_mode) {
+          size_t row_count = grad_merge.rows().size();
+          std::vector<int64_t> cpu_rows(grad_merge.rows());
+          for (size_t row_index = 0; row_index < row_count; ++row_index) {
+            for (size_t offset = 0; offset < row_numel; ++offset) {
+              size_t i = cpu_rows[row_index] * row_numel + offset;
+              functor.adam_update(i, grad_data[row_index * row_numel + offset]);
+            }
           }
+        } else {
+          functor(param.numel());
         }
-      } else {
+      } else if (platform::is_gpu_place(ctx.GetPlace())) {
+        SparseAdamFunctor<T, GPUAdam> functor(
+            beta1, beta2, epsilon, beta1_pow.template data<T>(),
+            beta2_pow.template data<T>(), mom1.template data<T>(),
+            mom1_out.template mutable_data<T>(ctx.GetPlace()),
+            mom2.template data<T>(),
+            mom2_out.template mutable_data<T>(ctx.GetPlace()),
+            lr.template data<T>(), grad_data, param.template data<T>(),
+            param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
+            grad_merge.rows().size(), lazy_mode);
+
+        // FIXME(minqiyang): remove BinarySearch in GPU later
         platform::ForRange<DeviceContext> for_range(
             static_cast<const DeviceContext&>(ctx.device_context()),
             param.numel());
diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a6b1c738af1daff5e3e4b1ac8e537de5adc93b76
--- /dev/null
+++ b/paddle/fluid/operators/py_func_op.cc
@@ -0,0 +1,313 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/py_func_op.h"
+#include <set>
+#include <string>
+#include <vector>
+#include "Python.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+namespace py = ::pybind11;
+
+static std::vector<py::object> g_py_callables;
+
+const char kForwardPythonCallableId[] = "forward_callable_id";
+const char kBackwardPythonCallableId[] = "backward_callable_id";
+const char kPyFuncBackwardSkipVars[] = "backward_skip_vars";
+
+size_t AppendPythonCallableObjectAndReturnId(const py::object &py_obj) {
+  g_py_callables.emplace_back(py_obj);
+  return g_py_callables.size() - 1;
+}
+
+// Return py::object* instead of py::object
+// Returning py::object would cause reference count increasing
+// but without GIL, reference count in Python may not be safe
+static py::object *GetPythonCallableObject(size_t i) {
+  PADDLE_ENFORCE_LT(i, g_py_callables.size(), "Invalid python callable id");
+  return &g_py_callables[i];
+}
+
+static std::string PythonFuncDebugString(const py::object &py_callable) {
+  py::gil_scoped_acquire guard;
+  std::string wrapper_func_str = py::str(py_callable);
+  auto inner_func = py_callable.attr("_func");
+  std::string inner_func_str = py::str(inner_func);
+  return inner_func_str + " wrapped by " + wrapper_func_str;
+}
+
+static void CallPythonFunc(py::object *callable,
+                           const std::vector<framework::LoDTensor> &ins,
+                           std::vector<framework::LoDTensor *> *outs) {
+  py::gil_scoped_acquire guard;
+  py::tuple in_args(ins.size());
+  for (size_t i = 0; i < ins.size(); ++i) {
+    in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr);
+  }
+
+  auto ret = (*callable)(*in_args);
+  auto ret_tuple = py::cast<py::tuple>(ret);
+  size_t ret_num = py::len(ret_tuple);
+  size_t out_num = outs->size();
+  if (UNLIKELY(ret_num != out_num)) {
+    // Python function has no return values or returns None
+    // In this case, ret_num = 1 && ret[0] == None && out_num should be 0
+    // Otherwise, ret_num must be equal to out_num
+    PADDLE_ENFORCE(
+        ret_num == 1 && out_num == 0 &&
+            py::cast<framework::LoDTensor *>(ret_tuple[0]) == nullptr,
+        "Output number not match. Expected %d, actual %d", out_num, ret_num);
+  }
+
+  for (size_t i = 0; i < out_num; ++i) {
+    auto *out = (*outs)[i];
+    if (out == nullptr) {
+      continue;
+    }
+    try {
+      auto *py_out_tensor = py::cast<framework::LoDTensor *>(ret_tuple[i]);
+      PADDLE_ENFORCE_NOT_NULL(py_out_tensor,
+                              "Output tensor %d should not be nullptr", i);
+      out->set_lod(py_out_tensor->lod());
+      out->ShareDataWith(*py_out_tensor);
+    } catch (py::cast_error &) {
+      PADDLE_THROW("The %d-th output must be LoDTensor", i);
+    }
+  }
+}
+
+class PyFuncOpVarTypInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op,
+                  framework::BlockDesc *block) const override {
+    auto &outs = op.Outputs();
+    bool has_out = (outs.count("Out") > 0 && !outs.at("Out").empty());
+
+    auto &ins = op.Inputs();
+    bool has_in = (ins.count("X") > 0 && !ins.at("X").empty());
+
+    /**
+     * X or Out can be empty, so that py_func can be more flexible
+     * to support Python functions with no input or no output
+     */
+    PADDLE_ENFORCE(has_in || has_out, "Input(X) or Output(Out) must exist");
+
+    PADDLE_ENFORCE_GE(boost::get<int>(op.GetAttr(kForwardPythonCallableId)), 0,
+                      "Function id cannot be less than 0");
+
+    if (!has_out) return;
+
+    /**
+     * Traverse all outputs, check if name of any output ends with @GRAD.
+     * If found, set its shape, dtype, lod_level, type to be the same as
+     * the corresponding forward variable
+     */
+    const std::string kGradVarSuffix = framework::kGradVarSuffix;
+    auto &out_var_names = outs.at("Out");
+    for (auto &out_var_name : out_var_names) {
+      if (out_var_name == framework::kEmptyVarName ||
+          out_var_name.size() < kGradVarSuffix.size()) {
+        continue;
+      }
+
+      size_t len = out_var_name.size() - kGradVarSuffix.size();
+      if (out_var_name.substr(len) == kGradVarSuffix) {
+        auto fwd_var_name = out_var_name.substr(0, len);
+        auto *out_var_desc = block->FindVarRecursive(out_var_name);
+        auto *fwd_var_desc = block->FindVarRecursive(fwd_var_name);
+        PADDLE_ENFORCE_NOT_NULL(out_var_desc, "Backward variable %s not found",
+                                out_var_name);
+        PADDLE_ENFORCE_NOT_NULL(fwd_var_desc, "Forward variable %s not found",
+                                fwd_var_name);
+        VLOG(10) << "Infer var_desc of Output(" << out_var_name << ") as Input("
+                 << fwd_var_name << ")";
+        out_var_desc->SetShape(fwd_var_desc->GetShape());
+        out_var_desc->SetDataType(fwd_var_desc->GetDataType());
+        out_var_desc->SetLoDLevel(fwd_var_desc->GetLoDLevel());
+        out_var_desc->SetType(fwd_var_desc->GetType());
+      }
+    }
+  }
+};
+
+class PyFuncOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(!ctx->IsRuntime(),
+                   "Infer shape cannot be called in runtime.");
+  }
+};
+
+class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Inputs of py_func op.").AsDuplicable();
+    AddOutput("Out", "Outputs of py_func op").AsDuplicable();
+    AddAttr<int>(kForwardPythonCallableId,
+                 "Index of registered forward Python function.")
+        .SetDefault(0);
+    AddAttr<int>(kBackwardPythonCallableId,
+                 "Index of registered backward Python function.")
+        .SetDefault(-1);
+    AddAttr<std::vector<std::string>>(kPyFuncBackwardSkipVars,
+                                      "Unused forward in/out in backward op")
+        .SetDefault(std::vector<std::string>());
+    AddComment(R"DOC("PyFunc Op")DOC");
+  }
+};
+
+/**
+ * There are several benefits when backward op of py_func op is
+ * still py_func op.
+ *
+ *  - Less codes are needed, since codes of backward is almost
+ *    the same as forward.
+ *
+ *  - To support high order derivative, so that py_func is
+ *    infinite-order differentiable
+ */
+class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase {
+ private:
+  static std::string DebugString(const std::vector<std::string> &strs) {
+    if (strs.empty()) return "";
+    std::string ret = strs[0];
+    for (size_t i = 1; i < strs.size(); ++i) {
+      ret += " ";
+      ret += strs[i];
+    }
+    return ret;
+  }
+
+ public:
+  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
+
+  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
+    auto &fwd_attrs = Attrs();
+    // no backward op when backward_id is less than 0
+    if (boost::get<int>(fwd_attrs.at(kBackwardPythonCallableId)) < 0) {
+      return {};
+    }
+
+    std::unique_ptr<framework::OpDesc> grad_op(new framework::OpDesc());
+    grad_op->SetType("py_func");
+
+    framework::AttributeMap bwd_attrs;
+    bwd_attrs[kForwardPythonCallableId] =
+        fwd_attrs.at(kBackwardPythonCallableId);
+    bwd_attrs[kBackwardPythonCallableId] = -1;
+    grad_op->SetAttrMap(bwd_attrs);
+
+    // All forward inputs
+    auto fwd_ins = Input("X");
+    // All forward outputs
+    auto fwd_outs = Output("Out");
+
+    // For memory reused, some inputs/output in forward part may be not needed
+    // in backward part. Skipping these vars helps to save memory
+    auto &backward_skip_var_list = boost::get<std::vector<std::string>>(
+        fwd_attrs.at(kPyFuncBackwardSkipVars));
+    std::unordered_set<std::string> backward_skip_var_set(
+        backward_skip_var_list.begin(), backward_skip_var_list.end());
+    std::vector<std::string> bwd_ins;
+    bwd_ins.reserve(fwd_ins.size() + fwd_outs.size());
+    for (auto &fwd_in : fwd_ins) {
+      if (backward_skip_var_set.count(fwd_in) == 0) {
+        bwd_ins.emplace_back(fwd_in);
+      }
+    }
+
+    for (auto &fwd_out : fwd_outs) {
+      if (backward_skip_var_set.count(fwd_out) == 0) {
+        bwd_ins.emplace_back(fwd_out);
+      }
+    }
+
+    // Backward OG cannot be skipped
+    // But in Python side, if OG is kEmptyVarName, input tensor would be None
+    auto fwd_out_grads = OutputGrad("Out");
+    bwd_ins.reserve(bwd_ins.size() + fwd_out_grads.size());
+    bwd_ins.insert(bwd_ins.end(), fwd_out_grads.begin(), fwd_out_grads.end());
+
+    // Backward IG cannot be skipped
+    // But in Python side, if IG is not needed, users can just return None
+    auto bwd_outs = InputGrad("X", false);
+
+    VLOG(10) << "PyFunc Grad Input: " << DebugString(bwd_ins);
+    VLOG(10) << "PyFunc Grad Output: " << DebugString(bwd_outs);
+
+    grad_op->SetInput("X", bwd_ins);
+    grad_op->SetOutput("Out", bwd_outs);
+
+    std::vector<std::unique_ptr<framework::OpDesc>> ret(1);
+    ret[0] = std::move(grad_op);
+    return ret;
+  }
+};
+
+class PyFuncOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ protected:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &in_arg_names = Inputs("X");
+    auto &out_arg_names = Outputs("Out");
+
+    std::vector<framework::LoDTensor> inputs(in_arg_names.size());
+    for (size_t i = 0; i < in_arg_names.size(); ++i) {
+      auto in_var = scope.FindVar(in_arg_names[i]);
+      // When py_func op is called in backward, in_var may be null
+      if (in_var == nullptr) {
+        continue;
+      }
+      auto &in_tensor = in_var->Get<framework::LoDTensor>();
+      if (!in_tensor.IsInitialized()) {
+        continue;
+      }
+      if (platform::is_gpu_place(in_tensor.place())) {
+        framework::TensorCopySync(in_tensor, platform::CPUPlace(), &inputs[i]);
+      } else {
+        inputs[i].ShareDataWith(in_tensor);
+      }
+      inputs[i].set_lod(in_tensor.lod());
+    }
+
+    std::vector<framework::LoDTensor *> outputs(out_arg_names.size());
+    for (size_t i = 0; i < out_arg_names.size(); ++i) {
+      auto *out_var = scope.FindVar(out_arg_names[i]);
+      outputs[i] =
+          out_var ? out_var->GetMutable<framework::LoDTensor>() : nullptr;
+    }
+
+    auto callable_id = static_cast<size_t>(Attr<int>(kForwardPythonCallableId));
+    auto *py_callable = GetPythonCallableObject(callable_id);
+    VLOG(10) << "Call Python function with id " << callable_id << ": "
+             << PythonFuncDebugString(*py_callable);
+    CallPythonFunc(py_callable, inputs, &outputs);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker,
+                  ops::PyFuncOpVarTypInference, ops::PyFuncOpShapeInference,
+                  ops::PyFuncOpGradDescMaker);
diff --git a/paddle/fluid/operators/py_func_op.h b/paddle/fluid/operators/py_func_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ba06bf59857d8cc4f1c56a52627d8e768ccbf7a
--- /dev/null
+++ b/paddle/fluid/operators/py_func_op.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pybind11/pybind11.h"
+
+namespace paddle {
+namespace operators {
+
+size_t AppendPythonCallableObjectAndReturnId(const ::pybind11::object &py_obj);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/quantize_mkldnn_op.cc b/paddle/fluid/operators/quantize_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0638e42873376bcec6e4de61494da46d1f0073d1
--- /dev/null
+++ b/paddle/fluid/operators/quantize_mkldnn_op.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/quantize_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using platform::to_void_cast;
+using Tensor = framework::Tensor;
+using framework::DataLayout;
+using mkldnn::stream;
+using platform::GetMKLDNNFormat;
+
+template <typename T>
+class QuantOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("Input");
+    auto scale_data = ctx.Attr<float>("Scale");
+    auto* output = ctx.Output<Tensor>("Output");
+    auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& engine = dev_ctx.GetEngine();
+
+    std::vector<primitive> pipeline;
+    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+
+    const T* input_data = input->data<T>();
+
+    mkldnn::primitive_attr attri;
+    int mask = 0;
+    attri.set_output_scales(mask, {scale_data});
+
+    auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32,
+                                          input->format());
+    auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
+    auto src_memory =
+        std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
+    std::shared_ptr<primitive::at> src_memory_p =
+        std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
+
+    bool is_negative = ctx.Attr<bool>("is_negative_input");
+    std::shared_ptr<mkldnn::memory::primitive_desc> dst_pd;
+    std::shared_ptr<mkldnn::memory> dst_memory;
+    if (is_negative) {
+      platform::ConvMKLDNNHandler::SetDstMemory<int8_t>(
+          ctx, output, dst_tz, engine, dst_pd, dst_memory);
+    } else {
+      platform::ConvMKLDNNHandler::SetDstMemory<uint8_t>(
+          ctx, output, dst_tz, engine, dst_pd, dst_memory);
+    }
+    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
+        new reorder::primitive_desc(src_pd, *dst_pd, attri));
+    auto reorder_p = std::shared_ptr<reorder>(
+        new reorder(*reorder_pd, *src_memory_p, *dst_memory));
+    pipeline.push_back(*reorder_p);
+    stream(stream::kind::eager).submit(pipeline).wait();
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(GetMKLDNNFormat(*dst_memory));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+// TODO(Xiaoli) Support FP32->S8 quantization.
+
+REGISTER_OP_KERNEL(quantize, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::QuantOpKernel<float>);
diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bf70c08bdb82218a2d0f63f3e70a2a1093e6a542
--- /dev/null
+++ b/paddle/fluid/operators/quantize_op.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License. */
+
+#include "paddle/fluid/operators/quantize_op.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+framework::OpKernelType QuantOp::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  framework::LibraryType library_ = framework::LibraryType::kMKLDNN;
+  framework::DataLayout layout_ = framework::DataLayout::kMKLDNN;
+
+  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                 ctx.GetPlace(), layout_, library_);
+}
+
+void QuantOpMaker::Make() {
+  AddInput("Input", "input data");
+  AddOutput("Output", "output data");
+  AddAttr<bool>("is_negative_input",
+                "(bool, default false) Only used in mkldnn INT8 kernel")
+      .SetDefault(false);
+  AddAttr<float>("Scale", "scale data").SetDefault({1.0f});
+  AddComment(R"DOC(This op will quantize data from FP32 to INT8)DOC");
+}
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(quantize, ops::QuantOp, ops::QuantOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
diff --git a/paddle/fluid/operators/quantize_op.h b/paddle/fluid/operators/quantize_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..091306e4637c7e2393b6736f0e1edf9dd7fd2c8a
--- /dev/null
+++ b/paddle/fluid/operators/quantize_op.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::OpKernelType;
+using framework::Tensor;
+
+class QuantOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
+    ctx->ShareLoD("Input", /*->*/ "Output");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class QuantOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/transpose_mkldnn_op.cc b/paddle/fluid/operators/transpose_mkldnn_op.cc
index 37f1cadc7d2ff248e8b6dcb3f0c8ba09f8ccd8b5..e6df7028f540d0928e2bb0763bd4cfef12059665 100644
--- a/paddle/fluid/operators/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/transpose_mkldnn_op.cc
@@ -29,10 +29,6 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
-    const bool is_test = ctx.Attr<bool>("is_test");
-    PADDLE_ENFORCE(
-        is_test == true,
-        "ConvTransposeMKLDNN works only for inference!. Set is_test = True");
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -47,69 +43,75 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       return;
     }
 
-    std::vector<int> nchw_axis(ndims, 0);
-    for (size_t i = 0; i < nchw_axis.size(); ++i) {
-      nchw_axis[i] = i;
-    }
-
     std::vector<int> nchw_tz = paddle::framework::vectorize2int(input->dims());
-    std::string data_format = ctx.Attr<std::string>("data_format");
 
-    auto src_md =
-        input->format() != mkldnn::memory::format::nchw
-            ? platform::MKLDNNMemDesc(nchw_tz, platform::MKLDNNGetDataType<T>(),
-                                      input->format())
-            : Axis2MemoryDesc(nchw_tz, nchw_axis);
+    const std::string key = platform::TransposeMKLDNNHandler::GetHash(
+        nchw_tz, axis, ctx.op().Output("Out"));
+
+    platform::TransposeMKLDNNHandler handler(nchw_tz, axis, dev_ctx,
+                                             mkldnn_engine, key);
 
-    this->TransposeKernel(ctx.GetPlace(), Axis2MemoryDesc(nchw_tz, axis),
-                          src_md, output, input_data, nchw_tz, mkldnn_engine);
+    auto transpose_src_memory_p = handler.AcquireSrcMemory(
+        input->format(), platform::to_void_cast<T>(input_data));
+    auto transpose_dst_memory_p =
+        handler.AcquireDstMemory(output, ctx.GetPlace());
+    auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
+                                                transpose_src_memory_p);
+
+    std::vector<mkldnn::primitive> pipeline;
+    pipeline.push_back(*transpose_p);
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
   }
+};
 
- protected:
-  mkldnn::memory::desc Axis2MemoryDesc(std::vector<int>& nchw_tz,
-                                       std::vector<int>& axis) const {
-    mkldnn_memory_desc_t mem_fmt;
+template <typename T>
+class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    if (!x_grad) return;
 
-    mem_fmt.primitive_kind = mkldnn_memory;
-    mem_fmt.ndims = axis.size();
-    for (unsigned int i = 0; i < nchw_tz.size(); ++i) {
-      mem_fmt.dims[i] = nchw_tz[i];  // logical dimensions (nchw format,
-                                     // regardless physical layout)
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    std::vector<int> reversed_axis(axis);
+    int ndims = axis.size();
+    if (ndims == 1) {
+      x_grad->ShareDataWith(*out_grad);
+      return;
     }
-    mem_fmt.data_type = mkldnn_f32;
-    mem_fmt.format = mkldnn_blocked;
-
-    unsigned int total_stride = 1;
-    for (int i = nchw_tz.size() - 1; i >= 0; --i) {
-      mem_fmt.layout_desc.blocking.padding_dims[i] =
-          nchw_tz[i];  // logical dimensions (nchw format, regardless physical
-                       // layout)
-      mem_fmt.layout_desc.blocking.block_dims[i] = 1;
-      mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
-      mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride;
-      mem_fmt.layout_desc.blocking.strides[1][axis[i]] = 1;
-      total_stride *= nchw_tz[axis[i]];
+
+    for (size_t i = 0; i < axis.size(); i++) {
+      reversed_axis[axis[i]] = i;
     }
-    mem_fmt.layout_desc.blocking.offset_padding = 0;  // no initial offset
-    return mem_fmt;
-  }
 
-  void TransposeKernel(platform::Place place, mkldnn::memory::desc md_o,
-                       mkldnn::memory::desc md_i, Tensor* output,
-                       const T* data_i, std::vector<int>& nchw_dims,
-                       const mkldnn::engine& eng) const {
-    // Make Memory primitive descriptors
-    auto mpd_o = mkldnn::memory::primitive_desc(md_o, eng);
-    auto mpd_i = mkldnn::memory::primitive_desc(md_i, eng);
+    const T* out_grad_data = out_grad->data<T>();
+    x_grad->mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> nchw_tz =
+        paddle::framework::vectorize2int(out_grad->dims());
 
-    auto data_o = output->mutable_data<T>(
-        place, paddle::memory::Allocator::kDefault, mpd_o.get_size());
+    const std::string key = platform::TransposeMKLDNNHandler::GetHash(
+        nchw_tz, axis, ctx.op().Output(framework::GradVarName("X")));
 
-    auto src = mkldnn::memory(mpd_i, (T*)(data_i));
-    auto dst = mkldnn::memory(mpd_o, data_o);
+    platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx,
+                                             mkldnn_engine, key);
 
-    auto r = mkldnn::reorder(src, dst);
-    mkldnn::stream(mkldnn::stream::kind::eager).submit({r}).wait();
+    auto transpose_src_memory_p = handler.AcquireSrcMemory(
+        out_grad->format(), platform::to_void_cast<T>(out_grad_data));
+    auto transpose_dst_memory_p =
+        handler.AcquireDstMemory(x_grad, ctx.GetPlace());
+    auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
+                                                transpose_src_memory_p);
+
+    std::vector<mkldnn::primitive> pipeline;
+    pipeline.push_back(*transpose_p);
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
   }
 };
 
@@ -122,3 +124,8 @@ REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::TransposeMKLDNNOpKernel<float>);
 REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::TransposeMKLDNNOpKernel<float>);
+
+REGISTER_OP_KERNEL(transpose_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNGradOpKernel<float>);
+REGISTER_OP_KERNEL(transpose2_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index b3b379d16ff099ba244fc92ed149a0089c2750e4..db14d350c7d92629873dfc5bc9181f651582e47c 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -79,10 +79,6 @@ class TransposeOp : public framework::OperatorWithKernel {
 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
     AddInput(
         "X",
         "(Tensor) The input tensor, tensors with rank up to 6 are supported.");
@@ -147,6 +143,24 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    return framework::OpKernelType(
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
+        ctx.GetPlace(), layout_, library_);
+  }
 };
 
 // FIXME(zcd): transpose2 adds an intermediate output(XShape) based on
@@ -237,9 +251,19 @@ class Transpose2OpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
     return framework::OpKernelType(
         ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
+        ctx.GetPlace(), layout_, library_);
   }
 };
 
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 2f205e1d5ca30d67a55e4df0f5e879ffef9a9c26..d1dff16ddd859e6bf19ec22420c28819a9f14d50 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -56,6 +56,8 @@ ELSE()
     set(MKLDNN_CTX_DEPS)
 ENDIF()
 
+cc_library(temp_allocator SRCS temporary_allocator.cc DEPS  allocator_facade)
+
 nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) 
 IF(WITH_GPU)
   set(STREAM_CALLBACK_DEPS stream_callback_manager)
@@ -66,7 +68,8 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
-    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}  temp_allocator)
+
 if(WIN32)
     if(WITH_GPU AND NOT WITH_DSO)
         get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
@@ -92,3 +95,9 @@ IF(WITH_GPU)
   nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
 ENDIF()
 nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
+
+if(WITH_GPU)
+    nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor)
+else()
+    cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor)
+endif()
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index f9a32bfa4c15261ba6b79fc4efd3a1961f7c6d4d..9d5ae813de0f5861d7bc97c9dc2885d91b7240fb 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -22,7 +22,6 @@ limitations under the License. */
 #ifdef __APPLE__
 #include <sys/sysctl.h>
 #include <sys/types.h>
-
 #elif defined(_WIN32)
 #define NOMINMAX  // msvc max/min macro conflict with std::min/max
 #include <windows.h>
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index d2e23d80f437e1df9216fa36e99a9be394dda074..022afb686b29c2c493cfd05600ee372470cbc710 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -85,6 +85,49 @@ DeviceContextPool::DeviceContextPool(
   }
 }
 
+DeviceTemporaryAllocator* DeviceTemporaryAllocator::allocators = nullptr;
+
+#ifdef PADDLE_WITH_CUDA
+platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
+    const platform::Place& place, const cudaStream_t& stream) {
+  PADDLE_ENFORCE(platform::is_gpu_place(place));
+  auto place_stream = std::make_pair(place, stream);
+  {
+    std::unique_lock<std::mutex> lock(mtx_);
+    if (!device_allocator_.count(place_stream)) {
+      device_allocator_[place_stream].reset(new TemporaryAllocator(place));
+      device_allocator_[place_stream]->SetCallback([stream]() {
+        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+        PADDLE_ENFORCE(cudaGetLastError());
+      });
+    }
+  }
+  return *device_allocator_.at(place_stream);
+}
+
+template <>
+platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
+    const platform::CUDADeviceContext& dev_ctx) {
+  auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream());
+  if (device_allocator_.count(place_stream)) {
+    return *device_allocator_.at(place_stream);
+  }
+  return Get(dev_ctx.GetPlace(), dev_ctx.stream());
+}
+#endif
+
+template <>
+platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
+    const platform::CPUDeviceContext& dev_ctx) {
+  return cpu_allocator_;
+}
+
+platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
+    const platform::Place& place) {
+  PADDLE_ENFORCE(platform::is_cpu_place(place), "You should pass CPUPlace");
+  return cpu_allocator_;
+}
+
 CPUDeviceContext::CPUDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
 }
@@ -213,10 +256,11 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
 
   LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device
                           << ", CUDA Capability: " << compute_capability_
-                          << ", Driver Version: " << driver_version_ / 1000
+                          << ", Driver API Version: " << driver_version_ / 1000
                           << "." << (driver_version_ % 100) / 10
-                          << ", Runtime Version: " << runtime_version_ / 1000
-                          << "." << (runtime_version_ % 100) / 10;
+                          << ", Runtime API Version: "
+                          << runtime_version_ / 1000 << "."
+                          << (runtime_version_ % 100) / 10;
   size_t cudnn_dso_ver = dynload::cudnnGetVersion();
   LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
                           << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
@@ -271,8 +315,12 @@ CUDADeviceContext::~CUDADeviceContext() {
 Place CUDADeviceContext::GetPlace() const { return place_; }
 
 void CUDADeviceContext::Wait() const {
-  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-  PADDLE_ENFORCE(cudaGetLastError());
+  auto& allocator =
+      DeviceTemporaryAllocator::Instance().Get<CUDADeviceContext>(*this);
+  allocator.Release([=]() {
+    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+    PADDLE_ENFORCE(cudaGetLastError());
+  });
 }
 
 int CUDADeviceContext::GetComputeCapability() const {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 812e56f1f966d03207cf83ad47cb88e9fa5d55bb..7e875801893f3b73f8efaf33af690f8c855beee4 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -15,8 +15,10 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/temporary_allocator.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
@@ -39,6 +41,71 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+/*! \brief device temporary allocator singleton.
+ *
+ * Some operator needs temporary memory during computation, for example,
+ * conv_gemm, which needs use col to store the result of im2col. If we
+ * create a stack memory which is used by CUDA Kernel, before the
+ * Computation(...) returns, we should add ctx->Wait(), because the
+ * execution of CUDA is async, if there doesn't have ctx->Wait(),
+ * the temporary memory will be released before the CUDA Kernel uses
+ * it.
+ *
+ * DeviceTemporaryAllocator is a singleton, which contains a
+ * `TemporaryAllocator` for each <Place, Stream>. And the TemporaryAllocator
+ * contains a temp_allocation_queue which is used to store the temporary
+ * allocations. The allocation, which is allocated by TemporaryAllocator,
+ * is a unique_ptr,  and when it is not held by any variable, it will be
+ * pushed into the temp_allocation_queue. There are two opportunities to free
+ * the allocations of temp_allocation_queue:
+ *  - when the Stream calls cudaStreamSynchronize;
+ *  - when the allocation size of opportunities exceeds a certain threshold
+ *    (defined by FLAGS_limit_of_temporary_allocation).
+ *
+ * */
+class DeviceTemporaryAllocator {
+ public:
+  static DeviceTemporaryAllocator& Instance() {
+    PADDLE_ENFORCE_NOT_NULL(allocators,
+                            "Need to Create DeviceTemporaryAllocator first!");
+    return *allocators;
+  }
+
+  static DeviceTemporaryAllocator& Init() {
+    if (allocators == nullptr) {
+      allocators = new DeviceTemporaryAllocator();
+    }
+    return *allocators;
+  }
+
+/*! \brief  Return handle of single temporary allocator. */
+#ifdef PADDLE_WITH_CUDA
+  platform::TemporaryAllocator& Get(const platform::Place& place,
+                                    const cudaStream_t& stream);
+#endif
+  template <typename DeviceContext>
+  platform::TemporaryAllocator& Get(const DeviceContext& dev_ctx);
+
+  platform::TemporaryAllocator& Get(const platform::Place& place);
+
+ private:
+  DeviceTemporaryAllocator() : cpu_allocator_(platform::CPUPlace()) {}
+
+  static DeviceTemporaryAllocator* allocators;
+
+  platform::TemporaryAllocator cpu_allocator_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::map<std::pair<platform::Place, cudaStream_t>,
+           std::unique_ptr<platform::TemporaryAllocator>>
+      device_allocator_;
+#endif
+
+  std::mutex mtx_;
+
+  DISABLE_COPY_AND_ASSIGN(DeviceTemporaryAllocator);
+};
+
 class DeviceContext {
  public:
   virtual ~DeviceContext() {}
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index eddebfe92ae80be7e70090aca041df1c6ea4cd11..990e44cd211c001c436dce8ff74a89a5516b38ae 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -227,6 +227,8 @@ void* GetTensorRtDsoHandle() {
 void* GetMKLMLDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
 #endif
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 0d10d82d74a2011b1b2bc088fe88cbfdb49600b8..ac86b38a61c9d8e3e946d9fb3f46d8feba7c034d 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -110,7 +110,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
   }
   places.emplace_back(platform::CPUPlace());
   platform::DeviceContextPool::Init(places);
-
+  platform::DeviceTemporaryAllocator::Init();
 #ifndef PADDLE_WITH_MKLDNN
   platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 1c6421f3fa6ffbe7d3c682611def9e87d2fae5b0..584df85e80203c383a89954aac73dd1dcd723f7c 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/place.h"
@@ -181,6 +182,21 @@ class MKLDNNHandler {
     return dims2str(operand_dims) + suffix;
   }
 
+  template <typename M>
+  static void SetDstMemory(
+      const framework::ExecutionContext& ctx, framework::Tensor* output,
+      std::vector<int> dst_tz, const mkldnn::engine& engine,
+      std::shared_ptr<mkldnn::memory::primitive_desc>& dst_pd,  // NOLINT
+      std::shared_ptr<mkldnn::memory>& dst_memory) {            // NOLINT
+    M* output_data = output->mutable_data<M>(ctx.GetPlace());
+    auto dst_md = platform::MKLDNNMemDesc(
+        {dst_tz}, paddle::framework::ToMKLDNNDataType(
+                      framework::DataTypeTrait<M>::DataType),
+        mkldnn::memory::format::nhwc);
+    dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine));
+    dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast<M>(output_data)));
+  }
+
  protected:
   static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
     std::string dstr = "";
@@ -197,6 +213,130 @@ class MKLDNNHandler {
   bool is_reusing_;
 };
 
+class TransposeMKLDNNHandler : public MKLDNNHandler {
+ public:
+  TransposeMKLDNNHandler(std::vector<int>& dims, std::vector<int>& axis,
+                         const platform::MKLDNNDeviceContext& dev_ctx,
+                         mkldnn::engine engine, const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        dims_(dims),
+        axis_(axis),
+        logical_axis_(dims.size(), 0) {}
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const mkldnn::memory::format& fmt, void* ptr) {
+    auto local_key = key_ + "@user_src_mem_p";
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   " find mem primitive in device context");
+    if (mem_p == nullptr) {
+      // Make memory descriptor using input format, unless it
+      // cannot be trusted (nchw) then make up memory fmt manually
+      for (size_t i = 0; i < logical_axis_.size(); ++i) {
+        logical_axis_[i] = i;
+      }
+      auto src_md = fmt != mkldnn::memory::format::nchw
+                        ? platform::MKLDNNMemDesc(
+                              dims_, platform::MKLDNNGetDataType<float>(), fmt)
+                        : Axis2MemoryDesc(dims_, logical_axis_);
+      mem_p = std::make_shared<mkldnn::memory>(
+          mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happenned. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output,
+                                                   platform::Place place) {
+    auto local_key = key_ + "@user_dst_mem_p";
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   " find mem primitive in device context");
+    if (mem_p == nullptr) {
+      auto dst_mdp = mkldnn::memory::primitive_desc{
+          Axis2MemoryDesc(dims_, axis_), engine_};
+
+      auto dst_data = output->mutable_data<float>(
+          place, paddle::memory::Allocator::kDefault, dst_mdp.get_size());
+
+      mem_p = std::make_shared<mkldnn::memory>(dst_mdp, dst_data);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      auto dst_data = output->mutable_data<float>(place);
+      mem_p->set_data_handle(dst_data);
+      // Mark that reusing happenned. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::reorder> AcquireTranspose(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> src_memory_p) {
+    auto prim_key = key_ + "@transpose_p";
+    auto transpose_p =
+        std::static_pointer_cast<mkldnn::reorder>(dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((transpose_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (transpose_p == nullptr) {
+      transpose_p =
+          std::make_shared<mkldnn::reorder>(*(src_memory_p), *(dst_memory_p));
+      dev_ctx_.SetBlob(prim_key, transpose_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return transpose_p;
+  }
+
+  static std::string GetHash(std::vector<int>& shape,  // NOLINT
+                             std::vector<int>& axis,   // NOLINT
+                             const std::string& suffix) {
+    return dims2str(shape) + dims2str(axis) + suffix;
+  }
+
+ protected:
+  mkldnn_memory_desc_t Axis2MemoryDesc(std::vector<int>& nchw_tz,
+                                       std::vector<int>& axis) {
+    mkldnn_memory_desc_t mem_fmt;
+
+    mem_fmt.primitive_kind = mkldnn_memory;
+    mem_fmt.ndims = axis.size();
+    for (unsigned int i = 0; i < nchw_tz.size(); ++i) {
+      mem_fmt.dims[i] = nchw_tz[i];  // logical dimensions (nchw format,
+                                     // regardless physical layout)
+    }
+    mem_fmt.data_type = mkldnn_f32;
+    mem_fmt.format = mkldnn_blocked;
+
+    unsigned int total_stride = 1;
+    for (int i = nchw_tz.size() - 1; i >= 0; --i) {
+      mem_fmt.layout_desc.blocking.padding_dims[i] =
+          nchw_tz[i];  // logical dimensions (nchw format, regardless physical
+                       // layout)
+      mem_fmt.layout_desc.blocking.block_dims[i] = 1;
+      mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
+      mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride;
+      mem_fmt.layout_desc.blocking.strides[1][axis[i]] = 1;
+      total_stride *= nchw_tz[axis[i]];
+    }
+    mem_fmt.layout_desc.blocking.offset_padding = 0;  // no initial offset
+    return mem_fmt;
+  }
+
+ private:
+  std::vector<int> dims_;
+  std::vector<int> axis_;
+  std::vector<int> logical_axis_;
+};
+
 template <class forward_t, class backward_data_t, class backward_weights_t>
 class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
  public:
diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0be017f75bcc8aff5073ebb2c5179cf7250be8b9
--- /dev/null
+++ b/paddle/fluid/platform/temporary_allocator.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/temporary_allocator.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+
+DEFINE_double(limit_of_temporary_allocation, -1,
+              "The up limit of temporary_allocation size.");
+
+namespace paddle {
+namespace platform {
+namespace alloc = memory::allocation;
+
+TemporaryAllocation::TemporaryAllocation(
+    alloc::AllocationPtr &&underlying_allocation)
+    : Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
+                 underlying_allocation->place()),
+      underlying_allocation_(std::move(underlying_allocation)) {}
+
+TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) {
+  temp_mem_queue_.reset(new std::deque<TemporaryAllocation *>());
+}
+
+bool TemporaryAllocator::IsAllocThreadSafe() const { return true; }
+
+void TemporaryAllocator::Release(const std::function<void()> &callback) {
+  std::shared_ptr<std::deque<TemporaryAllocation *>> t_allocations;
+  {
+    std::unique_lock<std::mutex> lock(mtx_);
+    callback();
+    t_allocations = temp_mem_queue_;
+    temp_mem_queue_.reset(new std::deque<TemporaryAllocation *>());
+    wait_delete_mem_ = 0;
+  }
+  for (auto tmp : *t_allocations) {
+    VLOG(10) << "Delete temporary allocation " << tmp->ptr()
+             << " size: " << tmp->size();
+    delete tmp;
+  }
+}
+
+void TemporaryAllocator::Free(alloc::Allocation *allocation) {
+  auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation);
+  PADDLE_ENFORCE_NOT_NULL(temp_allocation);
+  if (platform::is_gpu_place(temp_allocation->place())) {
+    size_t wait_delete_mem = 0;
+    {
+      std::unique_lock<std::mutex> lock(mtx_);
+      temp_mem_queue_->emplace_back(temp_allocation);
+      wait_delete_mem_ += temp_allocation->size();
+      wait_delete_mem = wait_delete_mem_;
+      VLOG(10) << "Move temporary allocation: " << temp_allocation->ptr()
+               << " to delete queue: " << temp_allocation->size() << "; "
+               << "wait_delete_mem: " << wait_delete_mem_;
+    }
+    if (FLAGS_limit_of_temporary_allocation > 0 &&
+        wait_delete_mem > FLAGS_limit_of_temporary_allocation) {
+      Release(callback_);
+    }
+    return;
+  }
+  delete temp_allocation;
+}
+
+size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
+  std::unique_lock<std::mutex> lock(mtx_);
+  return temp_mem_queue_ ? temp_mem_queue_->size() : 0;
+}
+
+void TemporaryAllocator::SetCallback(const std::function<void()> &callback) {
+  callback_ = callback;
+}
+
+alloc::Allocation *TemporaryAllocator::AllocateImpl(
+    size_t size, alloc::Allocator::Attr attr) {
+  auto raw_allocation =
+      alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
+  auto temp_mem = new TemporaryAllocation(std::move(raw_allocation));
+  VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size;
+  return temp_mem;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..812c4a333189d8c432be398ca0ebbce11f957561
--- /dev/null
+++ b/paddle/fluid/platform/temporary_allocator.h
@@ -0,0 +1,76 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <condition_variable>  // NOLINT
+#include <deque>
+#include <mutex>  // NOLINT
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/lock_guard_ptr.h"
+namespace paddle {
+namespace platform {
+
+class TemporaryAllocation : public memory::allocation::Allocation {
+ public:
+  explicit TemporaryAllocation(
+      memory::allocation::AllocationPtr &&underlying_allocation);
+
+  memory::allocation::AllocationPtr underlying_allocation_;
+};
+
+/*! \brief the TemporaryAllocator is used to alloc the temporary allocation
+ * which used by CUDA's async operation.
+ *
+ * The TemporaryAllocator contains a temp_allocation_queue which
+ * is used to store the temporary allocations. The allocation, which is
+ * allocated by TemporaryAllocator, is a unique_ptr, and when it is not held
+ * by any variable, it will be pushed into the  temp_allocation_queue.
+ *
+ * There is one opportunity to free the allocations of temp_allocation_queue:
+ *   - when the allocation size of opportunities exceeds a certain threshold
+ *     (defined by FLAGS_limit_of_temporary_allocation).
+ *
+ * */
+class TemporaryAllocator : public memory::allocation::Allocator {
+ public:
+  explicit TemporaryAllocator(platform::Place place);
+
+  void Release(const std::function<void()> &callback);
+
+  size_t TemporaryAllocationQueueSize();
+
+  bool IsAllocThreadSafe() const override;
+
+  void SetCallback(const std::function<void()> &callback);
+
+ protected:
+  void Free(memory::allocation::Allocation *allocation) override;
+
+  memory::allocation::Allocation *AllocateImpl(
+      size_t size, memory::allocation::Allocator::Attr attr) override;
+
+ private:
+  platform::Place place_;
+
+  // When the allocation is not held by any variable, it should be placed
+  // to temp_mem_queue immediately.
+  std::shared_ptr<std::deque<TemporaryAllocation *>> temp_mem_queue_{nullptr};
+
+  std::mutex mtx_;
+  size_t wait_delete_mem_{0};
+  std::function<void()> callback_;
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4e5be5b89f4cbecd6b5e9deec9cc5bffa6a4917
--- /dev/null
+++ b/paddle/fluid/platform/temporary_allocator_test.cc
@@ -0,0 +1,165 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/temporary_allocator.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/tensor_util.h"
+DECLARE_double(limit_of_temporary_allocation);
+
+namespace paddle {
+namespace platform {
+
+TEST(temporary_allocator, temporary_allocator) {
+  platform::CPUPlace cpu_place;
+  TemporaryAllocator alloc(cpu_place);
+  alloc.Allocate(100);
+
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu_place(0);
+  TemporaryAllocator gpu_alloc(gpu_place);
+
+  auto allocation = gpu_alloc.Allocate(101);
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+  gpu_alloc.Release([]() {});
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+
+  {
+    auto allocation = gpu_alloc.Allocate(102);
+    PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+  }
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
+  gpu_alloc.Release([]() {});
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+#endif
+}
+
+TEST(temporary_allocator, add_callback) {
+#ifdef PADDLE_WITH_CUDA
+  const double limit = FLAGS_limit_of_temporary_allocation;
+  FLAGS_limit_of_temporary_allocation = 10;
+  platform::CUDAPlace gpu_place(0);
+  TemporaryAllocator gpu_alloc(gpu_place);
+
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx =
+      static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
+  auto stream = dev_ctx->stream();
+  bool deleted = false;
+  gpu_alloc.SetCallback([stream, &deleted]() {
+    PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+    PADDLE_ENFORCE(cudaGetLastError());
+    deleted = true;
+  });
+  { gpu_alloc.Allocate(100); }
+  PADDLE_ENFORCE(deleted);
+  FLAGS_limit_of_temporary_allocation = limit;
+#endif
+}
+
+TEST(temporary_allocator, create_tensor_with_allocationptr) {
+  platform::CPUPlace cpu_place;
+  TemporaryAllocator cpu_alloc(cpu_place);
+  {
+    size_t memory_size = 200;
+    auto allocation = cpu_alloc.Allocate(memory_size);
+    void* address = allocation->ptr();
+    int numel = memory_size / sizeof(float);
+    framework::Tensor tensor = framework::GetTensor<float>(
+        std::move(allocation), framework::make_ddim({numel}));
+    PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+    PADDLE_ENFORCE_EQ(tensor.numel(), numel);
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu_place(0);
+  TemporaryAllocator gpu_alloc(gpu_place);
+
+  {
+    size_t memory_size = 300;
+    auto allocation = gpu_alloc.Allocate(memory_size);
+    void* address = allocation->ptr();
+    int numel = memory_size / sizeof(float);
+    framework::Tensor tensor = framework::GetTensor<float>(
+        std::move(allocation), framework::make_ddim({numel}));
+    PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+    PADDLE_ENFORCE_EQ(tensor.numel(), numel);
+  }
+
+  // The allocation is not holded now, it should be placed to
+  // TemporaryAllocationQueue.
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
+  gpu_alloc.Release([]() {});
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+#endif
+}
+
+TEST(temporary_allocator, create_tensor_with_allocationptr2) {
+  platform::CPUPlace cpu_place;
+  TemporaryAllocator cpu_alloc(cpu_place);
+  {
+    size_t memory_size = 400;
+    int numel = memory_size / sizeof(float);
+
+    framework::Tensor out_side_tensor;
+    void* address;
+    {
+      auto allocation = cpu_alloc.Allocate(memory_size);
+      address = allocation->ptr();
+      framework::Tensor tensor = framework::GetTensor<float>(
+          std::move(allocation), framework::make_ddim({numel}));
+      PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+      PADDLE_ENFORCE_EQ(tensor.numel(), numel);
+
+      out_side_tensor.ShareDataWith(tensor);
+    }
+    PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>());
+    PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel);
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu_place(0);
+  TemporaryAllocator gpu_alloc(gpu_place);
+  {
+    void* address;
+    size_t memory_size = 500;
+    int numel = memory_size / sizeof(float);
+    framework::Tensor out_side_tensor;
+    {
+      auto allocation = gpu_alloc.Allocate(memory_size);
+      address = allocation->ptr();
+      framework::Tensor tensor = framework::GetTensor<float>(
+          std::move(allocation), framework::make_ddim({numel}));
+      PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+      PADDLE_ENFORCE_EQ(tensor.numel(), numel);
+
+      out_side_tensor.ShareDataWith(tensor);
+    }
+    PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>());
+    PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel);
+    // The allocation is holded by out_side_tensor.
+    PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+    gpu_alloc.Release([]() {});
+    PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+  }
+
+  // The allocation is not holded now, it should be placed to
+  // TemporaryAllocationQueue.
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
+  gpu_alloc.Release([]() {});
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+#endif
+}
+
+}  //  namespace platform
+}  //  namespace paddle
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index c79d5d9403db613a8cdda59b9874a8b886458357..fb8bcb190bda59e23d118547f451be46c963cce9 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,5 +1,8 @@
 
 set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer)
+if(WITH_PYTHON)
+  list(APPEND PYBIND_DEPS py_func_op)
+endif()
 set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc)
 
 if(WITH_PYTHON)
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index ac406b27b5c77d1d919713bafd24fd8b1e3580f1..4b218fb3a2af0933ea1e87abe20e7e031c32f721 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -328,7 +328,7 @@ void BindOpDesc(pybind11::module *m) {
       .def("infer_var_type", &pd::OpDesc::InferVarType)
       .def("set_is_target", &pd::OpDesc::SetIsTarget)
       .def("serialize_to_string", SerializeMessage<pd::OpDesc>)
-      .def("block", &pd::OpDesc::Block,
+      .def("block", [](pd::OpDesc &self) { return self.Block(); },
            pybind11::return_value_policy::reference);
 }
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index f8a5c9deb066048ff645191c30b8c3dbf1d7eef5..88a2a5276ab52e25b4f790e3a2f1386ed0715b4e 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -37,6 +37,7 @@ limitations under the License. */
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -110,6 +111,12 @@ PYBIND11_MODULE(core, m) {
 
   BindException(&m);
 
+  m.def(
+      "_append_python_callable_object_and_return_id",
+      [](py::object py_obj) -> size_t {
+        return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj);
+      });
+
   py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
       .def(py::init<>())
       .def("_run_backward",
diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt
index eabb51d370aff709e289e1fc727aa2dbb551d82e..af033fa7407b8a81ebb162a2edff2fc41f8f5260 100644
--- a/paddle/fluid/train/demo/CMakeLists.txt
+++ b/paddle/fluid/train/demo/CMakeLists.txt
@@ -35,16 +35,26 @@ add_executable(demo_trainer demo_trainer.cc)
 
 if(WITH_MKLDNN)
   include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include")
-  set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0)
-endif()
+  if(WIN32)
+    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib)
+  else(WIN32)
+    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0)
+  endif(WIN32)
+endif(WITH_MKLDNN)
 
 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so)
+  if(WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib)
+  else(WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so)
+  endif(WIN32)
 else()
   if(APPLE)
     set(MATH_LIB cblas)
-  else(APPLE)
+  elseif(WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib)
+  else()
     set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
   endif(APPLE)
 endif()
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 139176b0d6c5dff511a97c9ac01f09e72a90306b..72c0d03e52246615d731719a7651010a4ede7e05 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -48,18 +48,13 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
 IF(WIN32)
     # Python would use the .pyd by default under Windows series platform
     set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/)
-    get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY)
     set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd)
-    add_custom_command(OUTPUT ${FLUID_CORE}
-            COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
-            COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR}
-            DEPENDS paddle_pybind)
 ELSE()
     set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so)
-    add_custom_command(OUTPUT ${FLUID_CORE}
-            COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
-            DEPENDS paddle_pybind)
 ENDIF()
+add_custom_command(OUTPUT ${FLUID_CORE}
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
+        DEPENDS paddle_pybind)
 add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE})
 
 IF(WIN32)
diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
index ece97b661fd7d60f8822439a84ee4403b9e3d81c..24621110b18f63779da14edc42765aae3bf4abd6 100644
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -22,6 +22,8 @@ from . import op_frequence
 from .op_frequence import *
 from . import quantize
 from .quantize import *
+from . import slim
+from .slim import *
 from . import utils
 from .utils import *
 
@@ -30,4 +32,5 @@ __all__ += decoder.__all__
 __all__ += memory_usage_calc.__all__
 __all__ += op_frequence.__all__
 __all__ += quantize.__all__
+__all__ += slim.__all__
 __all__ += utils.__all__
diff --git a/python/paddle/fluid/contrib/slim/__init__.py b/python/paddle/fluid/contrib/slim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..22dbf7c8b6bb2da7c310a20bdcbaffca248575b0
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/__init__.py
@@ -0,0 +1,25 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import *
+from .graph import *
+from .prune import *
+__all__ = [
+    'build_compressor',
+    'CompressPass',
+    'ImitationGraph',
+    'SensitivePruneStrategy',
+    'MagnitudePruner',
+    'RatioPruner',
+]
diff --git a/python/paddle/fluid/contrib/slim/core/__init__.py b/python/paddle/fluid/contrib/slim/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7826d5830a6f7f6d42cb1275c2289695c080e52f
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/core/__init__.py
@@ -0,0 +1,24 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import config
+from .config import *
+from . import compress_pass
+from .compress_pass import *
+from . import strategy
+from .strategy import *
+from . import pass_builder
+from .pass_builder import *
+
+__all__ = config.__all__ + compress_pass.__all__ + strategy.__all__ + pass_builder.__all__
diff --git a/python/paddle/fluid/contrib/slim/core/compress_pass.py b/python/paddle/fluid/contrib/slim/core/compress_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c348b878a1df43d7fb909f506c8cf65366866f
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/core/compress_pass.py
@@ -0,0 +1,129 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....core import CPUPlace
+from ..graph import get_executor
+
+__all__ = ['Context', 'CompressPass']
+
+
+class Context(object):
+    """
+    The context in the process of compression.
+    Args:
+        exe: The executor used to execute graph.
+        graph: The graph to be compressed.
+        scope: The scope used to execute graph.
+        program_exe: The program_exe is used to execute the program
+                     created for modifying the variables in scope.
+    """
+
+    def __init__(self, exe, graph, scope, program_exe=None):
+        # The total number of epoches to be trained.
+        self.epoch = 0
+        # Current epoch
+        self.epoch_id = 0
+        # Current batch
+        self.batch_id = 0
+        self.exe = exe
+        self.graph = graph
+        self.scope = scope
+        self.program_exe = program_exe
+
+
+class CompressPass(object):
+    """
+    The pass used to compress model.
+    Args:
+        place: The device used in compression.
+        data_reader: The data_reader used to run graph.
+        data_feeder: The data_feeder used to run graph.
+        scope: The scope used to run graph.
+        metrics: The metrics for evaluating model.
+        epoch: The total epoches of trainning in compression.
+        program_exe: The program_exe is used to execute the program
+                     created for modifying the variables in scope.
+    """
+
+    def __init__(self,
+                 place=None,
+                 data_reader=None,
+                 data_feeder=None,
+                 scope=None,
+                 metrics=None,
+                 epoch=None,
+                 program_exe=None):
+        self.strategies = []
+        self.place = CPUPlace() if place is None else place
+        self.data_reader = data_reader
+        self.data_feeder = data_feeder
+        self.scope = scope
+        self.metrics = metrics
+        self.epoch = epoch
+        self.program_exe = program_exe
+
+    def add_strategy(self, strategy):
+        """
+        Add a strategy to current compress pass.
+        Args:
+            strategy: The strategy to be added into current compress pass.
+        """
+        self.strategies.append(strategy)
+        self.epoch = max(strategy.end_epoch, self.epoch)
+
+    def apply(self, graph):
+        """
+        Compress a model.
+        Args:
+            graph: The target graph to be compressed.
+        """
+        self.executor = get_executor(graph, self.place)
+        context = Context(
+            self.executor, graph, self.scope, program_exe=self.program_exe)
+
+        for strategy in self.strategies:
+            strategy.on_compress_begin(context)
+
+        for epoch in range(self.epoch):
+
+            for strategy in self.strategies:
+                strategy.on_epoch_begin(context)
+
+            for data in self.data_reader():
+
+                for strategy in self.strategies:
+                    strategy.on_batch_begin(context)
+                fetches = None
+                if self.metrics:
+                    fetches = self.metrics.values()
+                feed = None
+                if self.data_feeder:
+                    feed = self.data_feeder.feed(data)
+                results = self.executor.run(graph,
+                                            fetches=fetches,
+                                            scope=self.scope,
+                                            feed=feed)
+                if results:
+                    print("results: {}".format(
+                        zip(self.metrics.keys(), results)))
+                for strategy in self.strategies:
+                    strategy.on_batch_end(context)
+                context.batch_id += 1
+
+            for strategy in self.strategies:
+                strategy.on_epoch_end(context)
+            context.epoch_id += 1
+
+        for strategy in self.strategies:
+            strategy.on_compress_end(context)
diff --git a/python/paddle/fluid/contrib/slim/core/config.py b/python/paddle/fluid/contrib/slim/core/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..811c45700376aff9883fe197007b582f63817f03
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/core/config.py
@@ -0,0 +1,111 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import funcsigs
+import yaml
+from collections import OrderedDict
+from ..prune import *
+from .compress_pass import *
+from .strategy import *
+
+__all__ = ['ConfigFactory']
+"""This factory is used to create instances by loading and parsing configure file with yaml format.
+"""
+
+
+class ConfigFactory(object):
+    def __init__(self, config):
+        """Init a factory from configure file."""
+        self.instances = {}
+        self.version = None
+        self._parse_config(config)
+
+    def get_compress_pass(self):
+        """
+        Get compress pass from factory.
+        """
+        return self.instance('compress_pass')
+
+    def instance(self, name):
+        """
+        Get instance from factory.
+        """
+        if name in self.instances:
+            return self.instances[name]
+        else:
+            return None
+
+    def _new_instance(self, name, attrs):
+        if name not in self.instances:
+            class_ = globals()[attrs['class']]
+            sig = funcsigs.signature(class_.__init__)
+            keys = [
+                param.name for param in sig.parameters.values()
+                if (param.kind == param.POSITIONAL_OR_KEYWORD)
+            ][1:]
+            keys = set(attrs.keys()).intersection(set(keys))
+            args = {}
+            for key in keys:
+                value = attrs[key]
+                if isinstance(value, str) and value in self.instances:
+                    value = self.instances[value]
+                args[key] = value
+            self.instances[name] = class_(**args)
+        return self.instances.get(name)
+
+    def _parse_config(self, config):
+        assert config
+        with open(config, 'r') as config_file:
+            key_values = self._ordered_load(config_file)
+            for key in key_values:
+                # parse version
+                if key == 'version' and self.version is None:
+                    self.version = int(key_values['version'])
+                    assert self.version == int(key_values['version'])
+
+                # parse pruners
+                if key == 'pruners' or key == 'strategies':
+                    instances = key_values[key]
+                    for name in instances:
+                        self._new_instance(name, instances[name])
+
+                if key == 'compress_pass':
+                    compress_pass = self._new_instance(key, key_values[key])
+                    for name in key_values[key]['strategies']:
+                        strategy = self.instance(name)
+                        compress_pass.add_strategy(strategy)
+
+                if key == 'include':
+                    for config_file in key_values[key]:
+                        self._parse_config(config_file.strip())
+
+    def _ordered_load(self,
+                      stream,
+                      Loader=yaml.Loader,
+                      object_pairs_hook=OrderedDict):
+        """
+        See: https://stackoverflow.com/questions/5121931/in-python-how-can-you-load-yaml-mappings-as-ordereddicts
+        """
+
+        class OrderedLoader(Loader):
+            pass
+
+        def construct_mapping(loader, node):
+            loader.flatten_mapping(node)
+            return object_pairs_hook(loader.construct_pairs(node))
+
+        OrderedLoader.add_constructor(
+            yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping)
+        return yaml.load(stream, OrderedLoader)
diff --git a/python/paddle/fluid/contrib/slim/core/pass_builder.py b/python/paddle/fluid/contrib/slim/core/pass_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc1ddc94e04f1d606292071ba7e5cc74fedd5d36
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/core/pass_builder.py
@@ -0,0 +1,39 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .compress_pass import CompressPass
+from .config import ConfigFactory
+
+__all__ = ['build_compressor']
+
+
+def build_compressor(place=None,
+                     data_reader=None,
+                     data_feeder=None,
+                     scope=None,
+                     metrics=None,
+                     epoch=None,
+                     config=None):
+    if config is not None:
+        factory = ConfigFactory(config)
+        comp_pass = factory.get_compress_pass()
+    else:
+        comp_pass = CompressPass()
+    comp_pass.place = place
+    comp_pass.data_reader = data_reader
+    comp_pass.data_feeder = data_feeder
+    comp_pass.scope = scope
+    comp_pass.metrics = metrics
+    comp_pass.epoch = epoch
+    return comp_pass
diff --git a/python/paddle/fluid/contrib/slim/core/strategy.py b/python/paddle/fluid/contrib/slim/core/strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d98e98b0c390599acfaefeb0636a599b46d391
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/core/strategy.py
@@ -0,0 +1,48 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['Strategy']
+
+
+class Strategy(object):
+    """
+    Base class for all strategies.
+    """
+
+    def __init__(self, start_epoch=0, end_epoch=10):
+        """
+        Args:
+            start_epoch: The first epoch to apply the strategy.
+            end_epoch: The last epoch to apply the strategy.
+        """
+        self.start_epoch = start_epoch
+        self.end_epoch = end_epoch
+
+    def on_compress_begin(self, context):
+        pass
+
+    def on_epoch_begin(self, context):
+        pass
+
+    def on_epoch_end(self, context):
+        pass
+
+    def on_batch_begin(self, context):
+        pass
+
+    def on_batch_end(self, context):
+        pass
+
+    def on_compress_end(self, context):
+        pass
diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ea888fa2c74a23b4769f75dce6a776afcca41a51
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
@@ -0,0 +1,28 @@
+version: 1.0
+pruners:
+    pruner_1:
+        class: 'RatioPruner'
+        ratios:
+            'conv1_1.w': 0.3
+            'conv1_2.w': 0.4
+            '*': 0.9
+        group_dims:
+            '*': [1, 2, 3]
+        criterions:
+            '*': 'l1-norm'
+strategies:
+    strategy_1:
+        class: 'SensitivePruneStrategy'
+        pruner: 'pruner_1'
+        start_epoch: 0
+        end_epoch: 10
+        delta_rate: 0.20
+        acc_loss_threshold: 0.2
+        sensitivities:
+            'conv1_1.w': 0.4
+
+compress_pass:
+    class: 'CompressPass'
+    epoch: 100
+    strategies:
+        - strategy_1
diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..21c59c0c9d2d9b76932ab6eeff73754940a3bfa0
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle
+import os
+import sys
+from paddle.fluid.contrib.slim import CompressPass
+from paddle.fluid.contrib.slim import build_compressor
+from paddle.fluid.contrib.slim import ImitationGraph
+
+
+class LinearModel(object):
+    def __init__(slef):
+        pass
+
+    def train(self):
+        train_program = fluid.Program()
+        startup_program = fluid.Program()
+        startup_program.random_seed = 10
+        with fluid.program_guard(train_program, startup_program):
+            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+            predict = fluid.layers.fc(input=x, size=1, act=None)
+            cost = fluid.layers.square_error_cost(input=predict, label=y)
+            avg_cost = fluid.layers.mean(cost)
+            eval_program = train_program.clone()
+            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+            sgd_optimizer.minimize(avg_cost)
+
+        train_reader = paddle.batch(
+            paddle.dataset.uci_housing.train(), batch_size=1)
+        eval_reader = paddle.batch(
+            paddle.dataset.uci_housing.test(), batch_size=1)
+        place = fluid.CPUPlace()
+        train_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+        eval_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+        exe = fluid.Executor(place)
+        exe.run(startup_program)
+        train_metrics = {"loss": avg_cost.name}
+        eval_metrics = {"loss": avg_cost.name}
+
+        graph = ImitationGraph(train_program)
+        config = './config.yaml'
+        comp_pass = build_compressor(
+            place,
+            data_reader=train_reader,
+            data_feeder=train_feeder,
+            scope=fluid.global_scope(),
+            metrics=train_metrics,
+            epoch=1,
+            config=config)
+        comp_pass.apply(graph)
+
+
+if __name__ == "__main__":
+    model = LinearModel()
+    model.train()
diff --git a/python/paddle/fluid/contrib/slim/graph/__init__.py b/python/paddle/fluid/contrib/slim/graph/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d65472d193b639f0766e278ec14b5dc36c5d62bc
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/__init__.py
@@ -0,0 +1,23 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import executor
+from .executor import *
+from . import graph
+from .graph import *
+from . import graph_pass
+from .graph_pass import *
+__all__ = executor.__all__
+__all__ += graph.__all__
+__all__ += graph_pass.__all__
diff --git a/python/paddle/fluid/contrib/slim/graph/executor.py b/python/paddle/fluid/contrib/slim/graph/executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..c02c3af82013287bf19e1869cb60dc65239b720a
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/executor.py
@@ -0,0 +1,62 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import abc
+from abc import abstractmethod
+from .... import executor
+from .graph import IRGraph, ImitationGraph
+
+__all__ = ['get_executor']
+
+
+class GraphExecutor(object):
+    __metaclass__ = abc.ABCMeta
+
+    def __init__(self, place):
+        self.place = place
+
+    @abstractmethod
+    def run(self, graph, feches=None, feed=None):
+        pass
+
+
+class IRGraphExecutor(GraphExecutor):
+    def run(self, grah, fetches, feed=None):
+        pass
+
+
+class ImitationGraphExecutor(GraphExecutor):
+    def __init__(self, place):
+        super(ImitationGraphExecutor, self).__init__(place)
+        self.exe = executor.Executor(place)
+
+    def run(self, graph, scope=None, fetches=None, feed=None):
+        assert isinstance(graph, ImitationGraph)
+        fetch_list = None
+        if fetches:
+            fetch_list = [
+                graph.program.global_block().var(name) for name in fetches
+            ]
+        results = self.exe.run(graph.program,
+                               scope=scope,
+                               fetch_list=fetch_list,
+                               feed=feed)
+        return results
+
+
+def get_executor(graph, place):
+    if isinstance(graph, ImitationGraph):
+        return ImitationGraphExecutor(place)
+    if isinstance(graph, IRGraph):
+        return IRGraphExecutor(place)
diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d6b0702035d49189c0919f976ea3c0c52663547
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/graph.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....framework import Program
+
+__all__ = ['Graph', 'ImitationGraph', 'IRGraph']
+
+
+class Graph(object):
+    """
+    Base class for all graph.
+    """
+
+    def __init__(self):
+        pass
+
+    def all_parameters(self):
+        """
+        Return all the parameters in current graph.
+        """
+        pass
+
+
+class ImitationGraph(Graph):
+    def __init__(self, program=None):
+        super(ImitationGraph, self).__init__()
+        self.program = Program() if program is None else program
+
+    def all_parameters(self):
+        return self.program.global_block().all_parameters()
+
+
+class IRGraph(Graph):
+    pass
diff --git a/python/paddle/fluid/contrib/slim/graph/graph_pass.py b/python/paddle/fluid/contrib/slim/graph/graph_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..1db6c4f110daa44be7fcbcc36f47224797b6dc88
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/graph_pass.py
@@ -0,0 +1,42 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['GraphPass', 'PruneParameterPass']
+
+
+class GraphPass(object):
+    """
+    Base class for all graph pass.
+    """
+
+    def __init__(self):
+        pass
+
+    def apply(self, graph):
+        pass
+
+
+class PruneParameterPass(GraphPass):
+    """
+    Generate a graph for pruning parameters from target graph.
+    """
+
+    def __init__(self, pruned_params, thresholds):
+        super(PruneParameterPass, self).__init__()
+        self.pruned_params = pruned_params
+        self.thresholds = thresholds
+        self.default_threshold = thresholds['*']
+
+    def apply(self, graph):
+        pass
diff --git a/python/paddle/fluid/contrib/slim/prune/__init__.py b/python/paddle/fluid/contrib/slim/prune/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..764a45bb130a9993015858f1cbdbc9f3b864bd5e
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/prune/__init__.py
@@ -0,0 +1,21 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import pruner
+from .pruner import *
+from . import prune_strategy
+from .prune_strategy import *
+
+__all__ = pruner.__all__
+__all__ += prune_strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..34c5107daa3cde10e7995902be37e34e19664da8
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..core.strategy import Strategy
+from ....framework import Program, program_guard
+from .... import layers
+import numpy as np
+
+__all__ = ['SensitivePruneStrategy', 'PruneStrategy']
+
+
+class SensitivePruneStrategy(Strategy):
+    def __init__(self,
+                 pruner=None,
+                 start_epoch=0,
+                 end_epoch=10,
+                 delta_rate=0.20,
+                 acc_loss_threshold=0.2,
+                 sensitivities=None):
+        super(SensitivePruneStrategy, self).__init__(start_epoch, end_epoch)
+        self.pruner = pruner
+        self.delta_rate = delta_rate
+        self.acc_loss_threshold = acc_loss_threshold
+        self.sensitivities = sensitivities
+
+
+class PruneStrategy(Strategy):
+    """
+    The strategy that pruning weights by threshold or ratio iteratively.
+    """
+
+    def __init__(self,
+                 pruner,
+                 mini_batch_pruning_frequency=1,
+                 start_epoch=0,
+                 end_epoch=10):
+        super(PruneStrategy, self).__init__(start_epoch, end_epoch)
+        self.pruner = pruner
+        self.mini_batch_pruning_frequency = mini_batch_pruning_frequency
+
+    def _triger(self, context):
+        return (context.batch_id % self.mini_batch_pruning_frequency == 0 and
+                self.start_epoch <= context.epoch_id < self.end_epoch)
+
+    def on_batch_end(self, context):
+        if self._triger(context):
+            prune_program = Program()
+            with program_guard(prune_program):
+                for param in context.graph.all_parameters():
+                    prune_program.global_block().clone_variable(param)
+                    p = prune_program.global_block().var(param.name)
+                    zeros_mask = self.pruner.prune(p)
+                    pruned_param = p * zeros_mask
+                    layers.assign(input=pruned_param, output=param)
+            context.program_exe.run(prune_program, scope=context.scope)
diff --git a/python/paddle/fluid/contrib/slim/prune/pruner.py b/python/paddle/fluid/contrib/slim/prune/pruner.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca72bcb6f6004c18f3ec794850e0aeaecb92d7ac
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/prune/pruner.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from .... import layers
+
+__all__ = ['Pruner', 'MagnitudePruner', 'RatioPruner']
+
+
+class Pruner(object):
+    """
+    Base class of all pruners.
+    """
+
+    def __init__(self):
+        pass
+
+    def prune(self, param):
+        pass
+
+
+class MagnitudePruner(Pruner):
+    """
+    Pruner used to pruning a parameter by threshold.
+    """
+
+    def __init__(self, threshold):
+        self.threshold = threshold
+
+    def prune(self, param, threshold=None):
+        if threshold is None:
+            thres = layers.fill_constant(
+                shape=[1], dtype='float32', value=self.threshold)
+        else:
+            thres = threshold
+        zeros_mask = layers.less_than(x=param, y=thres)
+        return zeros_mask
+
+
+class RatioPruner(Pruner):
+    """
+    Pruner used to pruning a parameter by ratio.
+    """
+
+    def __init__(self, ratios=None):
+        """
+        Args:
+            ratios: dict with pair (paramer_name, pruned_ratio). 
+        """
+        self.ratios = ratios
+
+    def prune(self, param, ratio=None):
+        """
+        Args:
+            ratio: `ratio=40%` means pruning (1 - 40%) weights to zero.
+        """
+        if ratio is None:
+            rat = self.ratios[
+                param.name] if param.name in self.ratios else self.ratios['*']
+        else:
+            rat = ratio
+        if rat < 1.0:
+            k = max(int(rat * np.prod(param.shape)), 1)
+            param_vec = layers.reshape(x=param, shape=[1, -1])
+            param_topk, _ = layers.topk(param_vec, k=k)
+            threshold = layers.slice(
+                param_topk, axes=[1], starts=[-1], ends=[k])
+            threshold = layers.reshape(x=threshold, shape=[1])
+            zeros_mask = layers.less_than(x=param, y=threshold)
+        else:
+            zeros_mask = layers.ones(param.shape)
+        return zeros_mask
diff --git a/python/paddle/fluid/contrib/slim/unitest/__init__.py b/python/paddle/fluid/contrib/slim/unitest/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d41233e227dc7bab94ee4284cc25e12b45bf469
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/unitest/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml b/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..db488b96330210df15b02b19d90abd5c9101f844
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml
@@ -0,0 +1,29 @@
+version: 1.0
+include: ["./unitest/configs/pruners.yaml", "./unitest/configs/pruners_0.yaml"]
+pruners:
+    pruner_1:
+        class: 'RatioPruner'
+        ratios:
+            'conv1_1.w': 0.3
+            'conv1_2.w': 0.4
+            '*': 0.9
+        group_dims:
+            '*': [1, 2, 3]
+        criterions:
+            '*': 'l1-norm'
+strategies:
+    strategy_1:
+        class: 'SensitivePruneStrategy'
+        pruner: 'pruner_2'
+        start_epoch: 0
+        end_epoch: 10
+        delta_rate: 0.20
+        acc_loss_threshold: 0.2
+        sensitivities:
+            'conv1_1.w': 0.4
+
+compress_pass:
+    class: 'CompressPass'
+    epoch: 100
+    strategies:
+        - strategy_1
diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml b/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..235092c595bf7c653221c7fe2b381fecf487fa49
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml
@@ -0,0 +1,12 @@
+version: 1.0
+pruners:
+    pruner_2:
+        class: 'RatioPruner'
+        ratios:
+            'conv1_1.w': 0.5
+            'conv1_2.w': 0.2
+            '*': 0.7
+        group_dims:
+            '*': [1, 2, 3]
+        criterions:
+            '*': 'l1-norm'
diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml b/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..cd2ef9eb56ddbc1367ce2e3b413372fbcd542bde
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml
@@ -0,0 +1,12 @@
+version: 1.0
+pruners:
+    pruner_3:
+        class: 'RatioPruner'
+        ratios:
+            'conv1_1.w': 0.5
+            'conv1_2.w': 0.2
+            '*': 0.7
+        group_dims:
+            '*': [1, 2, 3]
+        criterions:
+            '*': 'l1-norm'
diff --git a/python/paddle/fluid/contrib/slim/unitest/test_factory.py b/python/paddle/fluid/contrib/slim/unitest/test_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..07f28aac905d1a2813dbde6143235c7916fd9278
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/unitest/test_factory.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid.contrib.slim import ConfigFactory
+import unittest
+
+
+class TestFactory(unittest.TestCase):
+    def test_parse(self):
+        factory = ConfigFactory('./unitest/configs/config.yaml')
+
+        pruner = factory.instance('pruner_1')
+        self.assertEquals(pruner.ratios['conv1_1.w'], 0.3)
+
+        pruner = factory.instance('pruner_2')
+        self.assertEquals(pruner.ratios['*'], 0.7)
+
+        strategy = factory.instance('strategy_1')
+        pruner = strategy.pruner
+        self.assertEquals(pruner.ratios['*'], 0.7)
+
+        compress_pass = factory.get_compress_pass()
+        self.assertEquals(compress_pass.epoch, 100)
+
+        strategy = compress_pass.strategies[0]
+        self.assertEquals(strategy.delta_rate, 0.2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 06039b206b4ddb02e38035134e50b353b987074e..dde05189722fef77e03a1c2d8f3cbae44a3e8245 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -63,18 +63,14 @@ def noam_decay(d_model, warmup_steps):
     Returns:
         The decayed learning rate.
     """
+    with default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter(1)
 
-    def _lr_schedule(dtype):
-        with default_main_program()._lr_schedule_guard():
-            global_step = _decay_step_counter(1)
+        a = global_step**-0.5
+        b = (warmup_steps**-1.5) * global_step
+        lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
 
-            a = global_step**-0.5
-            b = (warmup_steps**-1.5) * global_step
-            lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
-
-        return lr_value
-
-    return _lr_schedule
+    return lr_value
 
 
 def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
@@ -113,19 +109,15 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
           sgd_optimizer.minimize(avg_cost)
 
     """
+    with default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter()
 
-    def _lr_schedule(dtype):
-        with default_main_program()._lr_schedule_guard():
-            global_step = _decay_step_counter()
-
-            div_res = global_step / decay_steps
-            if staircase:
-                div_res = ops.floor(div_res)
-            decayed_lr = learning_rate * (decay_rate**div_res)
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = ops.floor(div_res)
+        decayed_lr = learning_rate * (decay_rate**div_res)
 
-            return decayed_lr
-
-    return _lr_schedule
+        return decayed_lr
 
 
 def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
@@ -146,19 +138,15 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     Returns:
         The decayed learning rate
     """
+    with default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter()
 
-    def _lr_schedule(dtype):
-        with default_main_program()._lr_schedule_guard():
-            global_step = _decay_step_counter()
-
-            div_res = global_step / decay_steps
-            if staircase:
-                div_res = ops.floor(div_res)
-            decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
-
-            return decayed_lr
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = ops.floor(div_res)
+        decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
 
-    return _lr_schedule
+        return decayed_lr
 
 
 def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
@@ -196,20 +184,16 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
                     staircase=True))
           sgd_optimizer.minimize(avg_cost)
     """
+    with default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter()
 
-    def _lr_schedule(dtype):
-        with default_main_program()._lr_schedule_guard():
-            global_step = _decay_step_counter()
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = ops.floor(div_res)
 
-            div_res = global_step / decay_steps
-            if staircase:
-                div_res = ops.floor(div_res)
+        decayed_lr = learning_rate / (1 + decay_rate * div_res)
 
-            decayed_lr = learning_rate / (1 + decay_rate * div_res)
-
-            return decayed_lr
-
-    return _lr_schedule
+        return decayed_lr
 
 
 def polynomial_decay(learning_rate,
@@ -240,33 +224,28 @@ def polynomial_decay(learning_rate,
     Returns:
         Variable: The decayed learning rate
     """
+    with default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter()
 
-    def _lr_schedule(dtype, decay_steps=decay_steps):
-        with default_main_program()._lr_schedule_guard():
-            global_step = _decay_step_counter()
-
-            if cycle:
-                div_res = ops.ceil(global_step / decay_steps)
-                zero_var = tensor.fill_constant(
-                    shape=[1], dtype=dtype, value=0.0)
-                one_var = tensor.fill_constant(
-                    shape=[1], dtype=dtype, value=1.0)
-
-                with control_flow.Switch() as switch:
-                    with switch.case(global_step == zero_var):
-                        tensor.assign(input=one_var, output=div_res)
-                decay_steps = decay_steps * div_res
-            else:
-                decay_steps_var = tensor.fill_constant(
-                    shape=[1], dtype=dtype, value=float(decay_steps))
-                global_step = nn.elementwise_min(
-                    x=global_step, y=decay_steps_var)
+        if cycle:
+            div_res = ops.ceil(global_step / decay_steps)
+            zero_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=0.0)
+            one_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=1.0)
 
-            decayed_lr = (learning_rate - end_learning_rate) * \
-                ((1 - global_step / decay_steps) ** power) + end_learning_rate
-            return decayed_lr
+            with control_flow.Switch() as switch:
+                with switch.case(global_step == zero_var):
+                    tensor.assign(input=one_var, output=div_res)
+            decay_steps = decay_steps * div_res
+        else:
+            decay_steps_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=float(decay_steps))
+            global_step = nn.elementwise_min(x=global_step, y=decay_steps_var)
 
-    return _lr_schedule
+        decayed_lr = (learning_rate - end_learning_rate) * \
+            ((1 - global_step / decay_steps) ** power) + end_learning_rate
+        return decayed_lr
 
 
 def piecewise_decay(boundaries, values):
@@ -294,42 +273,38 @@ def piecewise_decay(boundaries, values):
 
 
     """
-
-    def _lr_schedule(dtype):
-        with default_main_program()._lr_schedule_guard():
-            if len(values) - len(boundaries) != 1:
-                raise ValueError("len(values) - len(boundaries) should be 1")
-
-            global_step = _decay_step_counter()
-
-            lr = tensor.create_global_var(
-                shape=[1],
-                value=0.0,
-                dtype='float32',
-                persistable=True,
-                name="learning_rate")
-
-            with control_flow.Switch() as switch:
-                for i in range(len(boundaries)):
-                    boundary_val = tensor.fill_constant(
-                        shape=[1],
-                        dtype='float32',
-                        value=float(boundaries[i]),
-                        force_cpu=True)
-                    value_var = tensor.fill_constant(
-                        shape=[1], dtype='float32', value=float(values[i]))
-                    with switch.case(global_step < boundary_val):
-                        tensor.assign(value_var, lr)
-                last_value_var = tensor.fill_constant(
+    with default_main_program()._lr_schedule_guard():
+        if len(values) - len(boundaries) != 1:
+            raise ValueError("len(values) - len(boundaries) should be 1")
+
+        global_step = _decay_step_counter()
+
+        lr = tensor.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="learning_rate")
+
+        with control_flow.Switch() as switch:
+            for i in range(len(boundaries)):
+                boundary_val = tensor.fill_constant(
                     shape=[1],
                     dtype='float32',
-                    value=float(values[len(values) - 1]))
-                with switch.default():
-                    tensor.assign(last_value_var, lr)
-
-        return lr
+                    value=float(boundaries[i]),
+                    force_cpu=True)
+                value_var = tensor.fill_constant(
+                    shape=[1], dtype='float32', value=float(values[i]))
+                with switch.case(global_step < boundary_val):
+                    tensor.assign(value_var, lr)
+            last_value_var = tensor.fill_constant(
+                shape=[1],
+                dtype='float32',
+                value=float(values[len(values) - 1]))
+            with switch.default():
+                tensor.assign(last_value_var, lr)
 
-    return _lr_schedule
+    return lr
 
 
 def append_LARS(params_grads, learning_rate, weight_decay):
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 4d44ce50a310cc6c95318a159b15544d8628e0bf..8ac7efee50d2df5084f1b93acdbc6708872e46b2 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -18,7 +18,9 @@ All layers just related to the neural network.
 from __future__ import print_function
 
 import numpy as np
+import six
 import os
+import inspect
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable, OpProtoHolder
@@ -176,6 +178,7 @@ __all__ = [
     'merge_selected_rows',
     'get_tensor_from_selected_rows',
     'lstm',
+    'py_func',
     'psroi_pool',
     'huber_loss',
 ]
@@ -7944,7 +7947,7 @@ def unstack(x, axis=0, num=None):
             num = x.shape[axis]
 
     outs = []
-    for _ in num:
+    for _ in range(num):
         outs.append(helper.create_variable_for_type_inference(x.dtype))
 
     helper.append_op(
@@ -9331,6 +9334,224 @@ def get_tensor_from_selected_rows(x, name=None):
     return out
 
 
+class PyFuncRegistry(object):
+    _register_funcs = []
+
+    def __init__(self, func):
+        if func is None or not callable(func):
+            raise TypeError('func must be a Python function')
+
+        self._func = func
+        # find named args using reflection 
+        args = inspect.getargspec(self._func)
+        if len(args[0]) == 0 and args[1] is None and args[2] is None:
+            # Function with no inputs
+            self._named_args = None
+        else:
+            self._named_args = args[0]
+        self._id = core._append_python_callable_object_and_return_id(self)
+        '''
+        Why record self here?
+
+        1. For debug usage. Users can call 
+           :code:`py_func.registered_func(idx)` method 
+           to find the registered function corresponding
+           to :code:`idx`. 
+
+        2. For increasing reference count of self. 
+           It seems that to release Python object 
+           whose reference count is 1 would cause
+           segmentation fault error in C++ side. 
+           May be lack of Python GC in C++ side?
+        '''
+        PyFuncRegistry._register_funcs.append(self)
+
+    @classmethod
+    def registered_func(cls, idx):
+        return cls._register_funcs[idx]._func
+
+    @classmethod
+    def registered_func_num(cls):
+        return len(cls._register_funcs)
+
+    @property
+    def id(self):
+        return self._id
+
+    def __call__(self, *args):
+        if self._named_args is None:
+            func_ret = self._func()
+        else:
+            kwargs = dict()
+            idx = 0
+            for arg in self._named_args:
+                kwargs[arg] = args[idx]
+                idx += 1
+            func_ret = self._func(*args[idx:], **kwargs)
+
+        if not isinstance(func_ret, (list, tuple)):
+            func_ret = (func_ret, )
+
+        ret = []
+        for each_ret in func_ret:
+            if each_ret is None or isinstance(each_ret, core.LoDTensor):
+                ret.append(each_ret)
+                continue
+
+            if not isinstance(each_ret, np.ndarray):
+                each_ret = np.array(each_ret)
+
+            tensor = core.LoDTensor()
+            tensor.set(each_ret, core.CPUPlace())
+            ret.append(tensor)
+
+        return tuple(ret)
+
+
+@templatedoc()
+def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
+    """
+    PyFunc Operator.
+    
+    User can use :code:`py_func` to register operators in Python side.
+    The inputs of :code:`func` is :code:`LoDTensor` and outputs can be
+    numpy array or :code:`LoDTensor`. Paddle would call the registered
+    :code:`func` in forward part, and call :code:`backward_func` in
+    backward part (if :code:`backward_func` is not None).
+
+    User should set the right data type and shape of :code:`out` before
+    calling this function. However, data types and shapes of gradients of
+    :code:`out` and :code:`x` would be inferred automatically.
+
+    Input orders of :code:`backward_func` would be: forward inputs
+    :code:`x`, forward outputs :code:`out` and backward input gradients of
+    :code:`out`. If some variables of :code:`out` have no gradient, the input
+    tensor would be None in Python side. If some variables of :code:`in` have
+    no gradient, users should return None.
+
+    This function can also be used to debug the running network. User can
+    add a :code:`py_func` operator without output, and print input 
+    :code:`x` inside :code:`func`.
+
+    Args:
+        func (callable): forward Python function.
+        x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`.
+        out (Variable|list(Variable)|tuple(Variable)): outputs of :code:`func`.
+            Paddle cannot infer shapes and data types of :code:`out`. Users
+            should create :code:`out` beforehand. 
+        backward_func (callable|None): backward Python function.
+                                       None means no backward. Default None. 
+        skip_vars_in_backward_input (Variable|list(Variable)|tuple(Variable)):
+            Variables that are not needed in :code:`backward_func` inputs. 
+            These variables must be any of :code:`x` and :code:`out`.
+            If set, these vars would not be inputs of :code:`backward_func`,
+            Only useful when :code:`backward_func` is not None. Default None. 
+
+    Returns:
+        out (Variable|list(Variable)|tuple(Variable)): input :code:`out`
+
+    Examples:
+    
+        >>> import paddle.fluid as fluid
+        >>> import six
+        >>>
+        >>> def create_tmp_var(name, dtype, shape):
+        >>>     return fluid.default_main_program().current_block().create_var(
+        >>>         name=name, dtype=dtype, shape=shape) 
+        >>>
+        >>> # tanh activation has been provided by Paddle C++ op
+        >>> # Here, we only use tanh to be an example to show the usage 
+        >>> # of py_func
+        >>> def tanh(x):
+        >>>     return np.tanh(x)
+        >>> 
+        >>> # forward input x is skipped
+        >>> def tanh_grad(y, dy):
+        >>>     return np.array(dy) * (1 - np.square(np.array(y)))
+        >>>
+        >>> def debug_func(x):
+        >>>     print(x) 
+        >>>
+        >>> def simple_net(img, label):
+        >>>     hidden = img
+        >>>     for idx in six.moves.range(4):
+        >>>         hidden = fluid.layers.fc(hidden, size=200)
+        >>>         new_hidden = create_tmp_var(name='hidden_{}'.format(idx),
+        >>>             dtype=hidden.dtype, shape=hidden.shape)    
+        >>>
+        >>>         # user-defined layers with forward and backward
+        >>>         hidden = fluid.layers.py_func(func=tanh, x=hidden, 
+        >>>             out=new_hidden, backward_func=tanh_grad, 
+        >>>             skip_vars_in_backward_input=hidden)
+        >>>
+        >>>         # user-defined debug layers to print variables
+        >>>         fluid.layers.py_func(func=debug_func, x=hidden, out=None)
+        >>>
+        >>>     prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+        >>>     loss = fluid.layers.cross_entropy(input=prediction, label=label)
+        >>>     return fluid.layers.mean(loss)
+    """
+    helper = LayerHelper('py_func', **locals())
+    if x is None:
+        x = []
+    elif isinstance(x, Variable):
+        x = [x]
+    elif not isinstance(x, (list, tuple)):
+        raise TypeError('Input must be Variable/list(Variable)/tuple(Variable)')
+
+    if out is None:
+        out_list = []
+    elif isinstance(out, Variable):
+        out_list = [out]
+    elif isinstance(out, (list, tuple)):
+        out_list = out
+    else:
+        raise TypeError(
+            'Output must be Variable/list(Variable)/tuple(Variable)')
+
+    fwd_func_id = PyFuncRegistry(func).id
+    bwd_func_id = PyFuncRegistry(
+        backward_func).id if backward_func is not None else -1
+
+    for each_out in out_list:
+        if len(each_out.shape) == 0:
+            raise ValueError(
+                'Output shapes of py_func op should be provided by users manually'
+            )
+
+    backward_skip_vars = set()
+    if backward_func is not None and skip_vars_in_backward_input is not None:
+        if isinstance(skip_vars_in_backward_input, Variable):
+            skip_vars_in_backward_input = [skip_vars_in_backward_input]
+
+        fwd_in_out = [v.name for v in x]
+        fwd_in_out.extend([v.name for v in out_list])
+        fwd_in_out = set(fwd_in_out)
+        backward_skip_vars = set()
+        for v in skip_vars_in_backward_input:
+            if not v.name in fwd_in_out:
+                raise ValueError(
+                    'Variable {} is not found in forward inputs and outputs'
+                    .format(v.name))
+            backward_skip_vars.add(v.name)
+
+    helper.append_op(
+        type='py_func',
+        inputs={'X': x},
+        outputs={'Out': out_list},
+        attrs={
+            'forward_callable_id': fwd_func_id,
+            'backward_callable_id': bwd_func_id,
+            'backward_skip_vars': list(backward_skip_vars)
+        })
+    return out
+
+
+# For debug usage
+py_func.registered_func = PyFuncRegistry.registered_func
+py_func.registered_func_num = PyFuncRegistry.registered_func_num
+
+
 @templatedoc()
 def psroi_pool(input,
                rois,
diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py
index 0b61c23d07e95acf7b4564753f748e7fb497e73e..8485d7d32fed8554c6d9afd610db230f52497da1 100644
--- a/python/paddle/fluid/net_drawer.py
+++ b/python/paddle/fluid/net_drawer.py
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
 try:
-    from .graphviz import Digraph
+    from .graphviz import Graph
 except ImportError:
     logger.info(
         'Cannot import graphviz, which is required for drawing a network. This '
@@ -112,7 +112,7 @@ def draw_graph(startup_program, main_program, **kwargs):
     filename = kwargs.get("filename")
     if filename == None:
         filename = str(graph_id) + ".gv"
-    g = Digraph(
+    g = Graph(
         name=str(graph_id),
         filename=filename,
         graph_attr=GRAPH_STYLE,
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 58cfc498c9edd77163b2bd4cad2cb991b6f2b20c..59c22d4e498814d468c78b10265b7afe35461dfb 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -50,21 +50,17 @@ class Optimizer(object):
 
     def __init__(self, learning_rate, regularization=None, name=None):
         if not isinstance(learning_rate, float) and \
-                not isinstance(learning_rate, framework.Variable) and \
-                not callable(learning_rate):
-            raise TypeError(
-                "learning rate should be float or Variable or callable(dtype)")
+                not isinstance(learning_rate, framework.Variable):
+            raise TypeError("learning rate should be float or Variable")
         self._name = name
         self.regularization = regularization
         self._learning_rate = learning_rate
         # the learning rate type should be inferenced from loss
         self._dtype = None
         # each program should have a independent learning rate
-        # program -> Variable(learning_rate) or:
-        # program -> callable(return learning_rate Variable)
+        # program -> Variable(learning_rate)
         self._learning_rate_map = dict()
-        if isinstance(self._learning_rate, framework.Variable) or \
-            callable(self._learning_rate):
+        if isinstance(self._learning_rate, framework.Variable):
             self._learning_rate_map[framework.default_main_program(
             )] = self._learning_rate
         # Dictionary of accumulators. Some optimizer subclasses need to
@@ -79,11 +75,6 @@ class Optimizer(object):
 
         if isinstance(lr, framework.Variable):
             return
-        elif callable(lr):
-            dtype = 'float32' if self._dtype is None else self._dtype
-            self._learning_rate_map[framework.default_main_program()] = lr(
-                dtype)
-            return
         else:
             if not isinstance(self._learning_rate, float):
                 raise TypeError(
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..835376ffe78f9119a9be6c379998e3a3b50aab43
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from paddle.fluid.tests.unittests.test_fill_constant_op import TestFillConstantOp1, TestFillConstantOp2, TestFillConstantOpWithSelectedRows
+
+
+class TestNGRAPHFillConstantOp1(TestFillConstantOp1):
+    def setUp(self):
+        super(TestNGRAPHFillConstantOp1, self).setUp()
+
+
+class TestNGRAPHFillConstantOp2(TestFillConstantOp2):
+    def setUp(self):
+        super(TestNGRAPHFillConstantOp2, self).setUp()
+
+
+class TestNGRAPHFillConstantOpWithSelectedRows(
+        TestFillConstantOpWithSelectedRows):
+    def setUp(self):
+        super(TestFillConstantOpWithSelectedRows, self).setUp()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a0171087dce5d4c7b72eca7f7e4fb955af94812
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d, TestTopkOp2, TestTopkOp3, TestTopkOp4
+
+
+class TestNGRAPHTopkOp(TestTopkOp):
+    def setUp(self):
+        super(TestNGRAPHTopkOp, self).setUp()
+
+
+class TestNGRAPHTopkOp2(TestTopkOp2):
+    def setUp(self):
+        super(TestNGRAPHTopkOp2, self).setUp()
+
+
+class TestNGRAPHTopkOp3(TestTopkOp3):
+    def setUp(self):
+        super(TestNGRAPHTopkOp3, self).setUp()
+
+
+class TestNGRAPHTopkOp4(TestTopkOp4):
+    def setUp(self):
+        super(TestNGRAPHTopkOp4, self).setUp()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c5e1abd7c8fb010357998c0ceaebaf21619fda9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py
@@ -0,0 +1,73 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestDeQuantizeOp(OpTest):
+    def setUp(self):
+        self.op_type = 'dequantize'
+        self.scale = 2.0
+        self.input_size = [1, 1, 5, 5]  #Naive nChw16c
+        self.data_type = 'int8'
+        self.set_scale()
+        self.set_data_type()
+
+        if self.data_type == 'int8':
+            input = (np.random.randint(0, 100, self.input_size) - 50
+                     ).astype(self.data_type)
+            output = (input * (1 / self.scale)).astype('float')
+        else:
+            input = (np.random.randint(0, 100,
+                                       self.input_size)).astype(self.data_type)
+            output = (input * (1 / self.scale)).astype('float')
+
+        self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(input)}
+
+        self.outputs = {'Output': output}
+
+        self.attrs = {'Scale': self.scale, }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def set_scale(self):
+        pass
+
+    def set_data_type(OpTest):
+        pass
+
+
+class TestDeQuantizeOp1(TestDeQuantizeOp):
+    def set_scale(self):
+        self.scale = 1.5
+
+    def set_data_type(self):
+        self.data_type = 'int8'
+
+
+class TestDeQuantizeOp2(TestDeQuantizeOp):
+    def set_scale(self):
+        self.scale = 0.8
+
+    def set_data_type(self):
+        self.data_type = 'uint8'
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
index 021b950b3b6245caecab22d476bbb9d6b6b45c5e..6cd02dad577b681b8c452bdb9574df60ffb4f82e 100644
--- a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
@@ -29,7 +29,7 @@ class TestGetTensorFromSelectedRows(unittest.TestCase):
 
     def check_with_place(self, place):
         scope = core.Scope()
-        x_rows = [0, 5, 5, 4, 20]
+        x_rows = [0, 5, 5, 4, 19]
         height = 20
         row_numel = 2
 
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index e34a712d844c2d45f442d04f9350fbd7bc911a2a..0d3e6d73e0149fe633b8f1de9041068c2e3bb293 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -97,7 +97,7 @@ class TestLearningRateDecay(unittest.TestCase):
         startup_prog = fluid.Program()
 
         with fluid.program_guard(main_prog, startup_prog):
-            decayed_lr = fluid_decay_fn(**kwargs)("float32")
+            decayed_lr = fluid_decay_fn(**kwargs)
 
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
index ce64da0478d3997f4889ca942c67e0defac80b45..d2fa344b67ab33a93f92733efd68e896c767bad2 100644
--- a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
@@ -29,8 +29,8 @@ class TestMergeSelectedRows(unittest.TestCase):
 
     def check_with_place(self, place):
         scope = core.Scope()
-        x_rows = [0, 5, 5, 4, 20]
-        out_rows = [0, 4, 5, 20]
+        x_rows = [0, 5, 5, 4, 19]
+        out_rows = [0, 4, 5, 19]
         height = 20
         row_numel = 2
 
diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..943ad3ed22480193dc51375cdcca5ed36ce35158
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py
@@ -0,0 +1,183 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import paddle.fluid as fluid
+import paddle
+import unittest
+import six
+import numpy as np
+
+dev_cnt = 2
+if fluid.core.is_compiled_with_cuda():
+    dev_cnt = fluid.core.get_cuda_device_count()
+os.environ['CPU_NUM'] = str(dev_cnt)
+
+
+def dummy_func_with_no_input():
+    return float(1.0)
+
+
+def dummy_func_with_no_output(x):
+    pass
+
+
+def tanh(x):
+    return np.tanh(x)
+
+
+def tanh_grad(y, dy):
+    return np.array(dy) * (1 - np.square(np.array(y)))
+
+
+def cross_entropy(logits, labels):
+    logits = np.array(logits)
+    labels = np.array(labels)
+    M = logits.shape[0]
+    N = logits.shape[1]
+    ret = np.ndarray([M, 1]).astype(logits.dtype)
+    for idx in six.moves.range(M):
+        ret[idx][0] = -np.log(logits[idx][labels[idx][0]])
+    return ret
+
+
+def cross_entropy_grad(logits, labels, bwd_dout):
+    logits = np.array(logits)
+    labels = np.array(labels)
+    bwd_dout = np.array(bwd_dout)
+    M = logits.shape[0]
+    N = logits.shape[1]
+    dlogits = np.zeros([M, N]).astype(logits.dtype)
+    for idx in six.moves.range(M):
+        dlogits[idx][labels[idx][0]] = -bwd_dout[idx] / logits[idx][labels[idx][
+            0]]
+    return dlogits, None
+
+
+def simple_fc_net(img, label, use_py_func_op):
+    hidden = img
+    for idx in range(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+        if not use_py_func_op:
+            hidden = fluid.layers.tanh(hidden)
+        else:
+            new_hidden = fluid.default_main_program().current_block(
+            ).create_var(
+                name='hidden_{}'.format(idx),
+                dtype='float32',
+                shape=hidden.shape)
+            hidden = fluid.layers.py_func(
+                func=tanh,
+                x=hidden,
+                out=new_hidden,
+                backward_func=tanh_grad,
+                skip_vars_in_backward_input=hidden)
+
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    if not use_py_func_op:
+        loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    else:
+        loss = fluid.default_main_program().current_block().create_var(
+            name='loss', dtype='float32', shape=[-1, 1])
+        loss = fluid.layers.py_func(
+            func=cross_entropy,
+            x=[prediction, label],
+            out=loss,
+            backward_func=cross_entropy_grad,
+            skip_vars_in_backward_input=loss)
+
+        dummy_var = fluid.default_main_program().current_block().create_var(
+            name='test_tmp_var', dtype='float32', shape=[1])
+        fluid.layers.py_func(
+            func=dummy_func_with_no_input, x=None, out=dummy_var)
+
+        fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None)
+
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def reader():
+    for _ in six.moves.range(dev_cnt * 100):
+        yield np.random.random([784]), np.random.random_integers(
+            size=[1], low=0, high=9)
+
+
+def test_main(use_cuda, use_py_func_op, use_parallel_executor):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return None
+
+    with fluid.program_guard(fluid.Program(), fluid.Program()):
+        with fluid.scope_guard(fluid.core.Scope()):
+            fluid.default_main_program().random_seed = 1
+            fluid.default_startup_program().random_seed = 1
+            np.random.seed(1)
+
+            img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            loss = simple_fc_net(img, label, use_py_func_op)
+            optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
+            optimizer.minimize(loss)
+
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
+            r = paddle.batch(reader, batch_size=10)
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            if use_parallel_executor:
+                exe = fluid.ParallelExecutor(
+                    use_cuda=use_cuda, loss_name=loss.name)
+                fetch_list = [loss.name]
+            else:
+                fetch_list = [loss]
+
+            ret = []
+            for epoch_id in six.moves.range(2):
+                for d in r():
+                    L, = exe.run(feed=feeder.feed(d), fetch_list=fetch_list)
+                    ret.append(L)
+
+            return np.array(ret)
+
+
+class TestPyFuncOpUseExecutor(unittest.TestCase):
+    def setUp(self):
+        self.use_parallel_executor = False
+
+    def test_loss_diff(self):
+        losses = []
+        for use_cuda in [True, False]:
+            for use_py_func_op in [True, False]:
+                L = test_main(use_cuda, use_py_func_op,
+                              self.use_parallel_executor)
+                if L is not None:
+                    losses.append(L)
+
+        for idx in six.moves.range(len(losses) - 1):
+            max_diff = np.max(np.abs(losses[idx] - losses[0]))
+            self.assertAlmostEqual(max_diff, 0, delta=1e-3)
+
+
+class TestPyFuncOpUseParallelExecutor(unittest.TestCase):
+    def setUp(self):
+        self.use_parallel_executor = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..99607928648be437b7f944f86a0c28b99d1775c4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py
@@ -0,0 +1,76 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestQuantizeOp(OpTest):
+    def setUp(self):
+        self.op_type = 'quantize'
+        self.scale = 2.0
+        self.input_size = [1, 1, 5, 5]  #Naive nChw16c
+        self.is_negative = False
+        self.set_scale()
+        self.set_is_negative()
+
+        if self.is_negative:
+            input = (100 * np.random.random_sample(self.input_size) - 50
+                     ).astype('float32')
+            output = np.round(input * self.scale).astype('int8')
+        else:
+            input = (100 *
+                     np.random.random_sample(self.input_size)).astype('float32')
+            output = np.round(input * self.scale).astype('uint8')
+
+        self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(input)}
+
+        self.outputs = {'Output': output}
+
+        self.attrs = {
+            'Scale': self.scale,
+            'is_negative_input': self.is_negative
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def set_scale(self):
+        pass
+
+    def set_is_negative(self):
+        pass
+
+
+class TestQuantizeOp1(TestQuantizeOp):
+    def set_scale(self):
+        self.scale = 1.5
+
+    def set_is_negative(self):
+        self.is_nagative = True
+
+
+class TestQuantizeOp2(TestQuantizeOp):
+    def set_scale(self):
+        self.scale = 0.1
+
+    def set_is_negative(self):
+        self.is_nagative = False
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
index 61ac8790112ceadfdef7b18aad70af77644581cd..0c201b9e4f48df94924a248d820ae2cf73367560 100644
--- a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
@@ -23,16 +23,6 @@ class TestTransposeMKLDNN(TestTransposeOp):
     def init_op_type(self):
         self.op_type = "transpose2"
         self.use_mkldnn = True
-        self.is_test = True
-        return
-
-    def test_check_grad(self):
-        return
-
-    def test_check_grad_no_input(self):
-        return
-
-    def test_check_grad_no_filter(self):
         return
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py
index 93be9d28da7a73f4fa972acf0dbd95167e7dfca3..a38540a7240636415ef4703609c5a3e8e83ed1da 100644
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -27,7 +27,6 @@ class TestTransposeOp(OpTest):
         self.attrs = {
             'axis': list(self.axis),
             'use_mkldnn': self.use_mkldnn,
-            'is_test': self.is_test,
         }
         self.outputs = {
             'XShape': np.random.random(self.shape).astype("float32"),
@@ -37,7 +36,6 @@ class TestTransposeOp(OpTest):
     def init_op_type(self):
         self.op_type = "transpose2"
         self.use_mkldnn = False
-        self.is_test = False
 
     def test_check_output(self):
         self.check_output(no_check_set=['XShape'])
diff --git a/python/requirements.txt b/python/requirements.txt
index 2f81d85df0626b294f4d861706b5c1b7ec9841d5..03d5e33e88cd5f1138ca8f6a6e885d6acfbc260e 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -9,3 +9,5 @@ Pillow
 nltk>=3.2.2
 graphviz
 six
+funcsigs
+pyyaml
diff --git a/python/setup.py.in b/python/setup.py.in
index 5d5f2dd0f18cd3e707dca8b9f337f2f2a07d47aa..c9afe6c885658b88ac520aad2e7b13facda02a92 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -109,6 +109,10 @@ packages=['paddle',
           'paddle.fluid.contrib',
           'paddle.fluid.contrib.decoder',
           'paddle.fluid.contrib.quantize',
+          'paddle.fluid.contrib.slim',
+          'paddle.fluid.contrib.slim.core',
+          'paddle.fluid.contrib.slim.graph',
+          'paddle.fluid.contrib.slim.prune',
           'paddle.fluid.contrib.utils',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details']
@@ -140,8 +144,6 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
                    '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
 
 package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]}
-if os.name == 'nt':
-    package_data['paddle.fluid'] += ['openblas' + ext_name]
 
 if '${WITH_FLUID_ONLY}'== 'OFF':
     package_data['paddle.v2.master']=['libpaddle_master' + ext_name]
@@ -166,11 +168,17 @@ package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') +
 shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
 
 if '${WITH_MKL}' == 'ON':
-    shutil.copy('${MKLML_LIB}', libs_path)
-    shutil.copy('${MKLML_IOMP_LIB}', libs_path)
-    package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name]
+    shutil.copy('${MKLML_SHARED_LIB}', libs_path)
+    shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
+    package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name]
+else:
+    if os.name == 'nt':
+        # copy the openblas.dll
+        shutil.copy(os.path.dirname('${CBLAS_LIBRARIES}') + '/openblas' + ext_name, libs_path)
+        package_data['paddle.libs'] += ['openblas' + ext_name]
+
 if '${WITH_MKLDNN}' == 'ON':
-    if '${CMAKE_BUILD_TYPE}' == 'Release':
+    if '${CMAKE_BUILD_TYPE}' == 'Release' and os.name != 'nt':
         # only change rpath in Release mode.
         # TODO(typhoonzero): use install_name_tool to patch mkl libs once
         # we can support mkl on mac.
@@ -181,7 +189,7 @@ if '${WITH_MKLDNN}' == 'ON':
         command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
         if os.system(command) != 0:
             raise Exception("patch libmkldnn.so failed, command: %s" % command)
-    package_data['paddle.libs']+=['libmkldnn.so.0']
+    package_data['paddle.libs']+=['libmkldnn.so.0' if os.name != 'nt' else ('mkldnn' + ext_name)]
     shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
 if '${WITH_NGRAPH}' == 'ON':
     # only change rpath in Release mode,