diff --git a/CMakeLists.txt b/CMakeLists.txt index c31f51a3f7371bd7b1b0ca3234091a35868806ce..66dcef0013efb486b532f9ae17e9ae2040dc9e38 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,16 +126,12 @@ if(ANDROID OR IOS) add_definitions(-DPADDLE_MOBILE_INFERENCE) endif() -if (APPLE OR WIN32) +if (APPLE) set(WITH_MKL OFF CACHE STRING - "Disable MKL for building on mac and windows" FORCE) + "Disable MKL for building on mac" FORCE) endif() if (WIN32) - set(WITH_DSO OFF CACHE STRING - "Disable DSO when compiling for Windows" FORCE) - set(WITH_MKL OFF CACHE STRING - "Disable MKL when compiling for Windows" FORCE) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) set(WITH_C_API OFF CACHE STRING diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 09bec347dbd569203103eccc7dbc0521c291bc0a..fb899e3d7cd4224acd25a559d0e18a09f552ad7d 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -44,9 +44,9 @@ if(WIN32) set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") endif(WIN32) -if(Apple) +if(APPLE) set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") -endif(Apple) +endif(APPLE) find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index b280db23b9b27bc658a79d01ea81122d2c987666..c29375cd0589764507f9a10859a6d4fdbb29716e 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -23,15 +23,14 @@ SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) -IF(WIN32 OR APPLE) +IF(APPLE) MESSAGE(WARNING - "Windows or Mac is not supported with MKLDNN in Paddle yet." + "Mac is not supported with MKLDNN in Paddle yet." "Force WITH_MKLDNN=OFF") - SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE) + SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in MacOS" FORCE) return() ENDIF() -SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." 
FORCE) MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") @@ -44,10 +43,14 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML") ELSE() MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") ENDIF() -SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds") -SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") -SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") -SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") + +IF(NOT WIN32) + SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds") + SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") + SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") + SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") +ENDIF(NOT WIN32) + ExternalProject_Add( ${MKLDNN_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} @@ -58,8 +61,15 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + CMAKE_ARGS -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} @@ -67,6 +77,11 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} -DMKLROOT:PATH=${MKLML_ROOT} ) +if(WIN32) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) +else(WIN32) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) +endif(WIN32) ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) @@ -85,10 +100,14 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) # copy the real so.0 lib to install dir # it can be directly contained in wheel or capi -SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) -ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} - COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} - DEPENDS mkldnn) +if(WIN32) + SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll) +else(WIN32) + SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) + ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} + DEPENDS mkldnn) +endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) IF(WITH_C_API) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index dc5427acd45f5da90317e7a3dc25f5453e2a7a00..d49839a89d78803f0fad58192283deae47ad72ef 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,56 +16,67 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) -IF(WIN32 OR APPLE) +IF(APPLE) MESSAGE(WARNING - "Windows or Mac is not supported with MKLML in Paddle yet." + "Mac is not supported with MKLML in Paddle yet." 
"Force WITH_MKLML=OFF") SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) return() ENDIF() INCLUDE(ExternalProject) - -SET(MKLML_PROJECT "extern_mklml") -IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") -SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") -SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) -SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) -SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +if(WIN32) + SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) +else() + SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +endif() SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") -INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) +IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) + MESSAGE(STATUS "use pre defined download url") + if(WIN32) + SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) + else() + SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + ENDIF() +endif() -FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(MKLML)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n" - " DESTINATION ${MKLML_DST_DIR})\n") +SET(MKLML_PROJECT "extern_mklml") +MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") +SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") +SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") ExternalProject_Add( ${MKLML_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${MKLML_SOURCE_DIR} + PREFIX ${MKLML_SOURCE_DIR} + URL ${MKLML_URL} DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz - && tar zxf ${MKLML_VER}.tgz DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/include ${MKLML_INC_DIR} && + ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR} ) +INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) + ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index a8b9dcfcf5eec39af0f59c03b1ed9bd4b71ee7bf..c6fe2e970d3e02985e3f2b8d5df6a7358beed514 100644 --- 
a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -267,7 +267,11 @@ function(cc_library TARGET_NAME) list(APPEND cc_library_DEPS dynload_mklml) endif() add_dependencies(${TARGET_NAME} mklml) - target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + if(WIN32) + target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) + else(WIN32) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + endif(WIN32) endif() # remove link to python, see notes at: # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 9f0adef7aa603ec5a3c8a5aa347613f462c43e60..48279bc809dde9e97c967c3ea5d03fbd7b89b017 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -115,20 +115,20 @@ if (NOT PROTOBUF_FOUND OR WIN32) ) endif () -if (NOT CBLAS_FOUND) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") - copy(openblas_lib - SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include - DSTS ${dst_dir} ${dst_dir} - DEPS extern_openblas - ) -elseif (WITH_MKLML) +if (WITH_MKLML) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") copy(mklml_lib SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} DEPS mklml ) +elseif (NOT CBLAS_FOUND OR WIN32) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") + copy(openblas_lib + SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include + DSTS ${dst_dir} ${dst_dir} + DEPS extern_openblas + ) endif () if (WITH_MKLDNN) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e3b44499258d288fe5692ca23efe1c4ec234f75c..b6974c6af290438f827c16bb478eb43e3cf42247 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -351,6 +351,23 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) +paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) +paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.CompressPass.apply ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.ImitationGraph.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.ImitationGraph.all_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.__init__ ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, 
defaults=(None, 0, 10, 0.2, 0.2, None)) +paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.MagnitudePruner.__init__ ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.MagnitudePruner.prune ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.RatioPruner.__init__ ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.RatioPruner.prune ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 6c8bec32de2a8c1d59155b812c05d5181acb82be..8fbbc6584e121d22bdec8173d501a35dc97c9c06 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -157,13 +157,8 @@ bool CheckLoD(const LoD &in, int tensor_height) { if (level.size() < 2) return false; // check: the first offset(the begin offset) of each level should be 0. if (level.front() != 0) return false; - // check: all the offsets in a level should be ascending(no same items - // allows). - if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) { - if (a < b) return true; - return false; - })) { - LOG(INFO) << "ascending error"; + // check: all the offsets in a level should be ascending(allow same items) + if (!std::is_sorted(level.begin(), level.end())) { return false; } } diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index cd50aaa26054b78f1b1e8f0d470b397892155a2b..15928c18d38b8a513b00f993b57faab43978bf53 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -217,6 +217,11 @@ TEST(LoD, CheckLoD) { // check with underlying tensor storage. 
ASSERT_TRUE(CheckLoD(relative_lod, 5)); ASSERT_FALSE(CheckLoD(relative_lod, 9)); + + // check whether lod is ascending-sorted (allow same items) + ASSERT_TRUE(CheckLoD({{0, 1, 2, 3, 4, 5}}, 5)); + ASSERT_TRUE(CheckLoD({{0, 1, 3, 3, 4, 5}}, 5)); + ASSERT_FALSE(CheckLoD({{0, 1, 3, 2, 5}}, 5)); } TEST(LoD, CheckAbsLoD) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 4b520a393f2ed217feb18937684d5feeea0923b9..fec311e3ee3aa94bbd640a8d4a85840d96b3af43 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -476,6 +476,28 @@ const Tensor* ExecutionContext::LegacyInput( template <> const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>( const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) { + return {}; + } + const std::vector<Variable*>& vars = it->second; + std::vector<const Tensor*> res; + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> const Tensor* { + if (var == nullptr) return nullptr; + PADDLE_ENFORCE( + var->IsType<LoDTensor>(), + "should be LoDTensor, but the received type is %s", + var->Type().name()); + return &(var->Get<LoDTensor>()); + }); + return res; +}
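For context, a minimal sketch (not part of the patch) of how an op kernel consumes this interface; the input slot name "X" is illustrative:

```cpp
// Inside some OpKernel::Compute(const framework::ExecutionContext& ctx):
// fetch every tensor bound to input slot "X" via the cached ctx_.inputs map.
auto xs = ctx.MultiInput<framework::Tensor>("X");
for (const framework::Tensor* x : xs) {
  if (x == nullptr) continue;  // unset inputs come back as nullptr
  // ... use x->data<float>() ...
}
```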
+ +template <> +const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>( + const std::string& name) const { auto names = op().Inputs(name); std::vector<const Tensor*> res; res.reserve(names.size()); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 39190d07b4ccdd5ffd03e2d50bb0e577ac00af75..1fe2daacf1369902cde732422b4e65c3d156250f 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -197,8 +197,31 @@ class ExecutionContext { const std::vector<const Variable*> MultiInputVar( const std::string& name) const { - auto names = op_.Inputs(name); + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) { + return {}; + } std::vector<const Variable*> res; + res.reserve(it->second.size()); + std::transform(it->second.begin(), it->second.end(), + std::back_inserter(res), + [this](Variable* var) { return var; }); + return res; + } + + std::vector<Variable*> MultiOutputVar(const std::string& name) const { + auto names = op_.Outputs(name); + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) { + return {}; + } + return it->second; + } + + const std::vector<const Variable*> LegacyMultiInputVar( + const std::string& name) const { + auto names = op_.Inputs(name); + std::vector<const Variable*> res; res.reserve(names.size()); std::transform(names.begin(), names.end(), std::back_inserter(res), [this](const std::string& name) { @@ -208,7 +231,7 @@ class ExecutionContext { return res; } - std::vector<Variable*> MultiOutputVar(const std::string& name) const { + std::vector<Variable*> LegacyMultiOutputVar(const std::string& name) const { auto names = op_.Outputs(name); std::vector<Variable*> res; res.reserve(names.size()); @@ -250,6 +273,38 @@ class ExecutionContext { template <typename T> const std::vector<const T*> MultiInput(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) { + return {}; + } + const std::vector<Variable*>& vars = it->second; + std::vector<const T*> res; + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> const T* { + return var == nullptr ? nullptr : &var->Get<T>(); + }); + return res; + } + + template <typename T> + std::vector<T*> MultiOutput(const std::string& name) const { + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) { + return {}; + } + const std::vector<Variable*>& vars = it->second; + std::vector<T*> res; + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> T* { + return var == nullptr ? nullptr : var->GetMutable<T>(); + }); + return res; + } + + template <typename T> + const std::vector<const T*> LegacyMultiInput(const std::string& name) const { auto names = op_.Inputs(name); std::vector<const T*> res; res.reserve(names.size()); @@ -262,7 +317,7 @@ class ExecutionContext { } template <typename T> - std::vector<T*> MultiOutput(const std::string& name) const { + std::vector<T*> LegacyMultiOutput(const std::string& name) const { auto names = op_.Outputs(name); std::vector<T*> res; res.reserve(names.size()); @@ -321,6 +376,10 @@ template <> const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>( const std::string& name) const; +template <> +const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>( + const std::string& name) const; + template <> Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const; diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 57335847a1931de6599560c6e9395a910282b0ee..5b09cad06c3f87ce29a8c986d30217099bd10d74 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/var_type.h" namespace paddle { namespace framework { @@ -27,6 +28,9 @@ void Tensor::check_memory_size() const { "or maybe the required data-type mismatches the data already stored."); } +Tensor::Tensor(std::type_index type) + : type_(framework::ToDataType(type)), offset_(0) {} + size_t Tensor::memory_size() const { return holder_ == nullptr ? 0UL : holder_->size() - offset_; } @@ -101,5 +105,12 @@ const DDim& Tensor::dims() const { return dims_; } int64_t Tensor::numel() const { return product(dims_); } +void Tensor::ResetHolder(std::shared_ptr<memory::Allocation> holder) { + if (holder_) { + PADDLE_ENFORCE_EQ(numel() * SizeOfType(type()), holder->size()); + } + holder_ = holder; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 6a1cbe5cd567429c922156f8bce7ca710b15a0f5..2e110133a33ede5c58779f9f7c52abd8e74c2fa0 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -69,6 +69,8 @@ class Tensor { public: Tensor() : type_(proto::VarType::FP32), offset_(0) {} + explicit Tensor(std::type_index type); + /*! Return a pointer to mutable memory block. */ template <typename T> T* data(); @@ -162,6 +164,8 @@ class Tensor { return std::move(holder_); } + void ResetHolder(std::shared_ptr<memory::Allocation> holder); + private: /*! holds the memory block if allocated.
*/ std::shared_ptr<memory::Allocation> holder_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index c751e8515829d06970c55f097f50de8bf33ee2a4..3937884ce4a5a16a1093ac8977033eaa98b2678e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -231,11 +231,14 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs, inputs[i].data.length()); } else { #ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast<const platform::CUDADeviceContext *>(pool.Get(place_)); auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_); memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr), platform::CPUPlace(), inputs[i].data.data(), - inputs[i].data.length(), - 0); // stream 0 for sync copy + inputs[i].data.length(), dev_ctx->stream()); #else PADDLE_THROW("Not compile with CUDA, should not reach here."); #endif diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 3d121e046004dfe6fc6953e0b23852b9ecda5c1b..102147a493ed1454db1a78124200f163f68e555b 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -208,11 +208,14 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, inputs[i].data.length()); } else { #ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast<const platform::CUDADeviceContext *>(pool.Get(place_)); auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_); memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr), platform::CPUPlace(), inputs[i].data.data(), - inputs[i].data.length(), - 0); // stream 0 for sync copy + inputs[i].data.length(), dev_ctx->stream()); #else PADDLE_THROW("Not compile with CUDA, should not reach here."); #endif diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 8d0d96d391efd7f0f11e9d48f5a6221431bd3824..f42ee9a697bfb4b8fefd4d3ba65afea4e74f0a85 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -89,12 +89,21 @@ endif() if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + if(NOT WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + else(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif(WIN32) set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") if(EXISTS ${MKLDNN_PATH}) include_directories("${MKLDNN_PATH}/include") - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + if(WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) + else(WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif(WIN32) endif() else() set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 249f308c13ff5636fbaa6747b28cab7886b7e736..4a7b31c7d491f0e4b73e2b574456d1567b7cc5dc 100644 --- a/paddle/fluid/operators/conv_op.h +++
b/paddle/fluid/operators/conv_op.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/fluid/platform/create_tensor_with_allocationptr.h" namespace paddle { namespace operators { @@ -123,6 +124,8 @@ class GemmConvKernel : public framework::OpKernel<T> { std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> dilations = context.Attr<std::vector<int>>("dilations"); + auto& dev_ctx = context.template device_context<DeviceContext>(); + const int batch_size = static_cast<int>(input->dims()[0]); // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} @@ -155,13 +158,19 @@ // to call the matrix multiplication interface. Tensor col_matrix; if (is_expand) { - col.mutable_data<T>(col_shape, context.GetPlace()); + auto tmp_allocation_ptr = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( + framework::product(col_shape) * sizeof(T)); + Tensor tep_tensor = + platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape); + + col.ShareDataWith(tep_tensor); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast<int>(input->dims().size())); + framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]}; @@ -178,7 +187,6 @@ math::Vol2ColFunctor<DeviceContext, T> vol2col; math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col; - auto& dev_ctx = context.template device_context<DeviceContext>(); auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); @@ -237,6 +245,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> { const int batch_size = static_cast<int>(input->dims()[0]); + auto& dev_ctx = context.template device_context<DeviceContext>(); + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims())); // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} @@ -262,8 +272,8 @@ framework::DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast<int>(input->dims().size())); + framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]}; @@ -286,13 +296,18 @@ // to call the matrix multiplication interface.
Tensor col_matrix; if (is_expand) { - col.mutable_data<T>(col_shape, context.GetPlace()); + auto tmp_allocation_ptr = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( + framework::product(col_shape) * sizeof(T)); + Tensor tep_tensor = + platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape); + + col.ShareDataWith(tep_tensor); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } math::SetConstant<DeviceContext, T> set_zero; - auto& dev_ctx = context.template device_context<DeviceContext>(); auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); if (input_grad) { diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index 6a92762896b89a06a91cd11fb38587f7df69e6c3..acd5993154ed03f206f20082231feb5059ef32e1 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -142,12 +142,13 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> { vars->mutable_data<T>(ctx.GetPlace()); framework::Tensor d_temp; - framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp); + framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp); // At least use 32 threads, at most 512 threads. // blockx is multiple of 32. int blockx = std::min( - static_cast<long>(((feature_width * num_priors + 31) >> 5) << 5), 512L); + static_cast<long>(((feature_width * num_priors + 31) >> 5) << 5), + 512L); int gridx = (feature_width * num_priors + blockx - 1) / blockx; dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index ec85fb80f4852cc6de1e8aeda86f0e98c9e1470a..4c73a70ed1ce2435bfc1a0f3d45afe9b6e3c4cf6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -16,11 +16,14 @@ limitations under the License.
*/ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/fluid/operators/jit/kernels.h" +#ifdef PADDLE_WITH_XBYAK #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" +#endif namespace paddle { namespace operators { @@ -81,8 +84,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> { UpdateDataFormat(ctx, const_cast<Tensor*>(x), "x_data_format"); UpdateDataFormat(ctx, const_cast<Tensor*>(y), "y_data_format"); - Xbyak::util::Cpu cpu; - const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F); + const bool is_avx512_enabled = platform::MayIUse(platform::avx512f); const bool are_dims_divisable = !(x_int_dims[1] % 16); const bool is_x_format_correct = x->format() == memory::format::nChw16c; const bool is_y_format_correct = y->format() == memory::format::nc; diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index ced29741253e72a17413de51fb2c24a7fb1257d3..262094f9224407bb412f5b189a748efe13cb04b2 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -21,5 +21,5 @@ endif() cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) if(NOT WIN32) - cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper) + cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer) endif() diff --git a/paddle/fluid/operators/jit/README.en.md b/paddle/fluid/operators/jit/README.en.md new file mode 100644 index 0000000000000000000000000000000000000000..8670ec2ff28ac8353217e0ee2f8c9b784e488ac7 --- /dev/null +++ b/paddle/fluid/operators/jit/README.en.md @@ -0,0 +1,76 @@ +# JIT Kernel + +JIT (Just In Time) Kernel contains actually generated code and some other implementations with the same logic. +Each implementation has its own condition for use, defined in `UseMe`. +They are combined to deliver the best performance for one single independent function. +They can be very simple functions like vector multiply, or complicated functions like LSTM. +They can also be composed with other existing jit kernels to build up a complex function. +Currently it is only supported on CPU. + +## Contents + +```txt +PaddlePaddle/Paddle/paddle/fluid/ +├── ... +└── operators/ + ├── .../ + └── jit/ + ├── ... + ├── gen/ + │ └── ... + |── more/ + │ ├── ... + │ ├── mkl/ + │ │ └── ... + │ ├── mkldnn/ + │ │ └── ... + │ ├── mix/ + │ │ └── ... + │ ├── intrinsic/ + │ │ └── ... + │ └── openblas/ + │ └── ... + └── refer/ + └── ... +``` + +All basic definitions of jit kernels live in `paddle/fluid/operators/jit`, including the three key folders `refer`, `gen`, and `more`. Each kernel has one unique name, while it may have several implementations with the same functionality. + +- `refer`: Each kernel must have one reference implementation on CPU. It should focus only on correctness and must not depend on any third-party library. +- `gen`: The generated code should be kept here. These implementations are designed for the best performance and depend on Xbyak. +- `more`: All other implementations should be kept in this folder, with one directory corresponding to one library kind or method kind, such as mkl, mkldnn, openblas or intrinsic code. Each implementation should have its own advantage. + +## How to use + +One simple function, `jit::Get`, is provided to obtain the kernel. It automatically returns the expected function with the best performance for the given attributes. All kernels are included in `paddle/fluid/operators/jit/kernels.h`; you only need to include this one header to get all the registered kernels.
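A minimal usage sketch (not part of the patch; it assumes a vector-multiply kernel key `kVMul` with the `XYZNTuples` mentioned in this README — the exact names live alongside `KernelType`):

```cpp
#include "paddle/fluid/operators/jit/kernels.h"

namespace jit = paddle::operators::jit;

// Multiply two float arrays of length n element-wise: z[i] = x[i] * y[i].
void VectorMul(const float* x, const float* y, float* z, int n) {
  // Get picks the best implementation registered for attribute n,
  // falling back to the reference one.
  auto vmul =
      jit::Get<jit::kVMul, jit::XYZNTuples<float>, paddle::platform::CPUPlace>(n);
  vmul(x, y, z, n);
}
```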
+ +## Solid Test + +- Unit Test + All functions should be compared with their corresponding reference functions, covering both the `float` and `double` data types. +- Benchmark + All functions should be tested to make sure `jit::Get` obtains the best performance with all attributes. + +# How to add a new kernel + +## Required + +1. Add `your_key` at `KernelType`. +2. Add the reference function of `your_key`. +Note: + - It should run on CPU and must not depend on any third party. + - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. +3. Add a unit test in `test.cc`, and verify at least `float` and `double`. +Test more data types for some special functions if necessary, for example `int8`. +4. Add functions in `benchmark.cc` to test all functions of the same `KernelType`. Make sure `jit::Get` always gets the best one. + +## Optional + +Add more implementations of `your_key` for performance enhancement. + +1. Add functions based on generated code in `gen`. They should be derived from `JitCode` and have a corresponding creator derived from `JitCodeCreator`, which will be registered on `your_key`. +Note: Add a new `KernelTuples` if necessary; you can refer to `XYZNTuples`. +Specialize the method `JitCodeKey` when adding a new attribute type. +2. Add more functions in `more`; you can use any third party you wish, like mkl, mkldnn or intrinsic code, to reach the best performance. diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index 89180b5900d760ce1da5bf0de879301e052db63a..cc19f09f56ddf6a7c74d6605ab3f1bd059f19bb8 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -10,26 +10,26 @@ ```txt PaddlePaddle/Paddle/paddle/fluid/ ├── ... -├── operator/ -│ ├── .../ -└── jit/ - ├── ... - ├── gen/ - │ └── ... - |── more/ - │ ├── ... - │ ├── mkl/ - │ │ └── ... - │ ├── mkldnn/ - │ │ └── ... - │ ├── mix/ - │ │ └── ... - │ ├── intrinsic/ - │ │ └── ... - │ └── openblas/ - │ └── ... - └── refer/ - └── ... +└── operators/ + ├── .../ + └── jit/ + ├── ... + ├── gen/ + │ └── ... + |── more/ + │ ├── ... + │ ├── mkl/ + │ │ └── ... + │ ├── mkldnn/ + │ │ └── ... + │ ├── mix/ + │ │ └── ... + │ ├── intrinsic/ + │ │ └── ... + │ └── openblas/ + │ └── ... + └── refer/ + └── ... ``` 基本类的定义都放在根目录下,根目录下包括gen,more和refer三个目录。每个目录下都是一种或者多种实现,每种kernel算子都需要有reference的实现,用作单元测试的基准,其他的实现都是可选的。 diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 48855abd267687b0f3c092279c1f29cc9fb1da40..4af01a437670aa6a07d370ff23ed2abd369f69a3 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -36,6 +36,8 @@ class GenBase : public Kernel { if (FLAGS_dump_jitcode) { this->dumpCode(code); } + // Note: failed to cast with reinterpret_cast on Mac clang, + // then workaround with const_cast. Any better idea is appreciated.
return reinterpret_cast<Func>(const_cast<unsigned char*>(code)); } diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 760a065c1081d1e55901774b258ba524471b856b..b10a19b658e383b8c7b4fbbe8f90da1fe0d4fd14 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -131,9 +131,8 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { int in_col = input[0].numel() / in_row; int out_row = in_row, out_col = 0; - framework::Vector<int16_t> inputs_data(in_num * sizeof(T*) / 2); - framework::Vector<int> inputs_col(in_num + 1); - T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data()); + std::vector<T*> inputs_data(in_num); + std::vector<int> inputs_col(in_num + 1); inputs_col[0] = 0; bool sameShape = true; @@ -144,12 +143,9 @@ } out_col += t_cols; inputs_col[i + 1] = out_col; - inputs_ptr[i] = const_cast<T*>(input[i].data<T>()); + inputs_data[i] = const_cast<T*>(input[i].data<T>()); } - T** dev_ins_data = - reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace())); - // computation // set the thread block and grid according to CurrentDeviceId const int kThreadsPerBlock = 1024; @@ -169,18 +165,32 @@ std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1)); dim3 grid_size = dim3(grid_cols, grid_rows, 1); + auto tmp_dev_ins_data = + platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( + inputs_data.size() * sizeof(T*)); + memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()), + tmp_dev_ins_data->ptr(), platform::CPUPlace(), + static_cast<void*>(inputs_data.data()), + inputs_data.size() * sizeof(T*), context.stream()); + T** dev_ins_data = reinterpret_cast<T**>(tmp_dev_ins_data->ptr()); + if (sameShape) { ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>( dev_ins_data, in_col, out_row, out_col, output->data<T>()); } else { - const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace()); + auto tmp_dev_ins_col_data = + platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( + inputs_col.size() * sizeof(int)); + memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), + static_cast<void*>(inputs_col.data()), + inputs_col.size() * sizeof(int), context.stream()); + int* dev_ins_col_data = static_cast<int*>(tmp_dev_ins_col_data->ptr()); + ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>( dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()), out_row, out_col, output->data<T>()); } - // Wait() must be called because `inputs_data` may be destructed before - // kernel ends - context.Wait(); } }; @@ -207,9 +217,8 @@ class SplitFunctor<platform::CUDADeviceContext, T> { int in_col = 0, in_row = out_row; bool sameShape = true; - framework::Vector<int16_t> outputs_data(o_num * sizeof(T*) / 2); - framework::Vector<int> outputs_cols(o_num + 1); - T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data()); + std::vector<T*> outputs_data(o_num); + std::vector<int> outputs_cols(o_num + 1); outputs_cols[0] = 0; for (int i = 0; i < o_num; ++i) { @@ -220,15 +229,12 @@ in_col += t_col; outputs_cols[i + 1] = in_col; if (outputs->at(i) != nullptr) { - outputs_ptr[i] = outputs->at(i)->data<T>(); + outputs_data[i] = outputs->at(i)->data<T>(); } else { - outputs_ptr[i] = nullptr; + outputs_data[i] = nullptr; } } - T** dev_out_gpu_data = - reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace())); - // computation const int kThreadsPerBlock = 1024; int block_cols = kThreadsPerBlock; @@ -247,18 +253,33 @@ std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1)); dim3 grid_size = dim3(grid_cols, grid_rows, 1); + auto tmp_dev_outs_data = +
platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( + outputs_data.size() * sizeof(T*)); + memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()), + tmp_dev_outs_data->ptr(), platform::CPUPlace(), + reinterpret_cast<void*>(outputs_data.data()), + outputs_data.size() * sizeof(T*), context.stream()); + T** dev_out_gpu_data = reinterpret_cast<T**>(tmp_dev_outs_data->ptr()); + if (sameShape) { SplitKernel<<<grid_size, block_size, 0, context.stream()>>>( input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data); } else { - const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); + auto tmp_dev_ins_col_data = + platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( + outputs_cols.size() * sizeof(int)); + memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), + reinterpret_cast<void*>(outputs_cols.data()), + outputs_cols.size() * sizeof(int), context.stream()); + int* dev_outs_col_data = + reinterpret_cast<int*>(tmp_dev_ins_col_data->ptr()); + SplitKernel<<<grid_size, block_size, 0, context.stream()>>>( input.data<T>(), in_row, in_col, dev_outs_col_data, static_cast<int>(outputs_cols.size()), dev_out_gpu_data); } - // Wait() must be called because `outputs_data` may be destructed before - // kernel ends - context.Wait(); } }; diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index ccbd05c82ad6a880d21269092088be9656b35c99..2e3779ff0845294e71f27801049c010e0a585e6b 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -17,6 +17,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" +#if defined(_WIN32) +#if defined(__AVX2__) || defined(__AVX__) +inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } +#endif +#endif + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 83bb4dde46fa241affad3788e3381b6ecd8aa098..08088eb8733f28f0dc8ecade2aa4b70342244b0a 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -92,8 +92,8 @@ template <typename T> class MeanIoUCUDAOpKernel : public framework::OpKernel<T> { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context<platform::CUDADeviceContext>() - .eigen_device(); + auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); + auto& place = *dev_ctx.eigen_device(); // get input and output tensor auto* predictions = ctx.Input<Tensor>("Predictions"); auto* labels = ctx.Input<Tensor>("Labels"); @@ -115,11 +115,11 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel<T> { auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong); auto out_correct_t = EigenTensor<int, 1>::From(*out_correct); - // Temporary tensor - Tensor ious; - float* ious_data = ious.mutable_data<float>( - {static_cast<int>(num_classes)}, ctx.GetPlace()); - auto ious_t = EigenTensor<float, 1>::From(ious); + // Temporary memory + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto tmp_ious_data = allocator.Allocate(num_classes * sizeof(float)); + float* ious_data = static_cast<float*>(tmp_ious_data->ptr()); // Init out_wrong, out_correct and out_mean_iou out_wrong_t.device(place) = out_wrong_t.constant(0); @@ -148,7 +148,7 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel<T> { CountCUDAKernel<T><<<grid, block, 0, stream>>>( num_classes, predictions->numel(), predictions_data, labels_data, out_wrong_data, out_correct_data); - ctx.device_context().Wait(); - ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data, out_correct_data, ious_data, out_mean_iou_data);
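The pattern these CUDA kernels now share, as a hedged sketch (not part of the patch; it assumes a `CUDADeviceContext` named `dev_ctx` and a byte count `n` in scope):

```cpp
// Stream-aware scratch memory: instead of allocating a framework::Tensor,
// grab a temporary allocation; freed buffers are queued and released when
// the device context's stream is synchronized (see CUDADeviceContext::Wait).
auto& allocator = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
auto scratch = allocator.Allocate(n);  // n bytes on dev_ctx's device
float* data = static_cast<float*>(scratch->ptr());
// ... launch kernels on dev_ctx.stream() that use `data` ...
```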
diff --git a/paddle/fluid/operators/transpose_mkldnn_op.cc b/paddle/fluid/operators/transpose_mkldnn_op.cc index 2f133c9e251388e9e78a6a49ca66a45a56eef76e..e6df7028f540d0928e2bb0763bd4cfef12059665 100644 --- a/paddle/fluid/operators/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/transpose_mkldnn_op.cc @@ -29,10 +29,6 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - const bool is_test = ctx.Attr<bool>("is_test"); - PADDLE_ENFORCE( - is_test == true, - "TransposeMKLDNN works only for inference!. Set is_test = True"); auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -68,6 +64,57 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { } }; +template <typename T> +class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + auto* out_grad = + ctx.Input<Tensor>(framework::GradVarName("Out")); + auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X")); + if (!x_grad) return; + + auto& dev_ctx = + ctx.template device_context<platform::MKLDNNDeviceContext>(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + std::vector<int> axis = ctx.Attr<std::vector<int>>("axis"); + std::vector<int> reversed_axis(axis); + int ndims = axis.size(); + if (ndims == 1) { + x_grad->ShareDataWith(*out_grad); + return; + } + + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + const T* out_grad_data = out_grad->data<T>(); + x_grad->mutable_data<T>(ctx.GetPlace()); + + std::vector<int> nchw_tz = + paddle::framework::vectorize2int(out_grad->dims()); + + const std::string key = platform::TransposeMKLDNNHandler::GetHash( + nchw_tz, axis, ctx.op().Output(framework::GradVarName("X"))); + + platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, + mkldnn_engine, key); + + auto transpose_src_memory_p = handler.AcquireSrcMemory( + out_grad->format(), platform::to_void_cast<T>(out_grad_data)); + auto transpose_dst_memory_p = + handler.AcquireDstMemory(x_grad, ctx.GetPlace()); + auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, + transpose_src_memory_p); + + std::vector<mkldnn::primitive> pipeline; + pipeline.push_back(*transpose_p); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } +}; + } // namespace operators } // namespace paddle @@ -77,3 +124,8 @@ REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace, ops::TransposeMKLDNNOpKernel<float>); REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace, ops::TransposeMKLDNNOpKernel<float>); + +REGISTER_OP_KERNEL(transpose_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::TransposeMKLDNNGradOpKernel<float>); +REGISTER_OP_KERNEL(transpose2_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::TransposeMKLDNNGradOpKernel<float>); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index b3b379d16ff099ba244fc92ed149a0089c2750e4..db14d350c7d92629873dfc5bc9181f651582e47c 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -79,10 +79,6 @@ class TransposeOp : public framework::OperatorWithKernel { class TransposeOpMaker : public framework::OpProtoAndCheckerMaker
{ public: void Make() override { - AddAttr<bool>("is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); AddInput( "X", "(Tensor) The input tensor, tensors with rank up to 6 are supported."); @@ -147,6 +143,24 @@ class TransposeOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), x_dims); } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr<std::string>("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif + return framework::OpKernelType( + ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(), + ctx.GetPlace(), layout_, library_); + } }; // FIXME(zcd): transpose2 adds an intermediate output(XShape) based on @@ -237,9 +251,19 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr<std::string>("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif return framework::OpKernelType( ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(), - ctx.device_context()); + ctx.GetPlace(), layout_, library_); } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 2f205e1d5ca30d67a55e4df0f5e879ffef9a9c26..d1dff16ddd859e6bf19ec22420c28819a9f14d50 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -56,6 +56,8 @@ ELSE() set(MKLDNN_CTX_DEPS) ENDIF() +cc_library(temp_allocator SRCS temporary_allocator.cc DEPS allocator_facade) + nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) IF(WITH_GPU) set(STREAM_CALLBACK_DEPS stream_callback_manager) @@ -66,7 +68,8 @@ ENDIF() # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} - place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) + place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} temp_allocator) + if(WIN32) if(WITH_GPU AND NOT WITH_DSO) get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) @@ -92,3 +95,9 @@ IF(WITH_GPU) nv_test(cuda_helper_test SRCS cuda_helper_test.cu) ENDIF() nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) + +if(WITH_GPU) + nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) +else() + cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) +endif() diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index
f9a32bfa4c15261ba6b79fc4efd3a1961f7c6d4d..9d5ae813de0f5861d7bc97c9dc2885d91b7240fb 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -22,7 +22,6 @@ limitations under the License. */ #ifdef __APPLE__ #include <sys/sysctl.h> #include <sys/types.h> - #elif defined(_WIN32) #define NOMINMAX // msvc max/min macro conflict with std::min/max #include <windows.h> diff --git a/paddle/fluid/platform/create_tensor_with_allocationptr.h b/paddle/fluid/platform/create_tensor_with_allocationptr.h new file mode 100644 index 0000000000000000000000000000000000000000..00fcc5f86209b2a827ac070773f4b0049b0457d8 --- /dev/null +++ b/paddle/fluid/platform/create_tensor_with_allocationptr.h @@ -0,0 +1,42 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/temporary_allocator.h" +namespace paddle { +namespace platform { + +template <typename T> +paddle::framework::Tensor GetTensor( + memory::allocation::AllocationPtr temp_allocation_ptr, + const framework::DDim &dim) { + auto &deleter = temp_allocation_ptr.get_deleter(); + auto *allocation_ptr = temp_allocation_ptr.release(); + auto shared_allocation = + std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter); + + PADDLE_ENFORCE(dynamic_cast<TemporaryAllocation *>(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); + PADDLE_ENFORCE_EQ(allocation_ptr->size(), + framework::product(dim) * sizeof(T)); + + paddle::framework::Tensor temp_tensor(std::type_index(typeid(T))); + temp_tensor.Resize(dim); + temp_tensor.ResetHolder(std::move(shared_allocation)); + return temp_tensor; +} + +} // namespace platform +} // namespace paddle
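A minimal sketch of the intended use of this helper (not part of the patch, mirroring the conv kernel above; `dev_ctx` and `dims` are assumed to be in scope):

```cpp
// Wrap stream-scoped scratch memory in a Tensor without an extra copy:
auto ptr = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
    framework::product(dims) * sizeof(float));
framework::Tensor tmp = platform::GetTensor<float>(std::move(ptr), dims);
// tmp can now be handed to functors that expect a framework::Tensor.
```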
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index d2e23d80f437e1df9216fa36e99a9be394dda074..81c443d758fcf22545af4bf8e452be8f0ecc0a89 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -85,6 +85,49 @@ DeviceContextPool::DeviceContextPool( } } +DeviceTemporaryAllocator* DeviceTemporaryAllocator::allocators = nullptr; + +#ifdef PADDLE_WITH_CUDA +platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( + const platform::Place& place, const cudaStream_t& stream) { + PADDLE_ENFORCE(platform::is_gpu_place(place)); + auto place_stream = std::make_pair(place, stream); + { + std::unique_lock<std::mutex> lock(mtx_); + if (!device_allocator_.count(place_stream)) { + device_allocator_[place_stream].reset(new TemporaryAllocator(place)); + device_allocator_[place_stream]->SetCallback([stream]() { + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaGetLastError()); + }); + } + } + return *device_allocator_.at(place_stream); +} + +template <> +platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( + const platform::CUDADeviceContext& dev_ctx) { + auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream()); + if (device_allocator_.count(place_stream)) { + return *device_allocator_.at(place_stream); + } + return Get(dev_ctx.GetPlace(), dev_ctx.stream()); +} +#endif + +template <> +platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( + const platform::CPUDeviceContext& dev_ctx) { + return cpu_allocator_; +} + +platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( + const platform::Place& place) { + PADDLE_ENFORCE(platform::is_cpu_place(place), "You should pass CPUPlace"); + return cpu_allocator_; +} + CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } @@ -271,8 +314,12 @@ CUDADeviceContext::~CUDADeviceContext() { Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE(cudaGetLastError()); + auto& allocator = + DeviceTemporaryAllocator::Instance().Get(*this); + allocator.Release([=]() { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE(cudaGetLastError()); + }); } int CUDADeviceContext::GetComputeCapability() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 812e56f1f966d03207cf83ad47cb88e9fa5d55bb..af9744dcb847f8af97e87cc18d2aee376f3f3d6c 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -15,8 +15,10 @@ limitations under the License. */ #include <memory> // NOLINT #include <string> #include <unordered_map> +#include <map> #include <vector> #include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/temporary_allocator.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" @@ -39,6 +41,50 @@ limitations under the License. */ namespace paddle { namespace platform { +/*! \brief device temporary allocator singleton */ +class DeviceTemporaryAllocator { + public: + static DeviceTemporaryAllocator& Instance() { + PADDLE_ENFORCE_NOT_NULL(allocators, + "Need to Create DeviceTemporaryAllocator first!"); + return *allocators; + } + + static DeviceTemporaryAllocator& Init() { + if (allocators == nullptr) { + allocators = new DeviceTemporaryAllocator(); + } + return *allocators; + } + +/*! \brief Return handle of single temporary allocator.
+#ifdef PADDLE_WITH_CUDA
+  platform::TemporaryAllocator& Get(const platform::Place& place,
+                                    const cudaStream_t& stream);
+#endif
+  template <typename DeviceContext>
+  platform::TemporaryAllocator& Get(const DeviceContext& dev_ctx);
+
+  platform::TemporaryAllocator& Get(const platform::Place& place);
+
+ private:
+  DeviceTemporaryAllocator() : cpu_allocator_(platform::CPUPlace()) {}
+
+  static DeviceTemporaryAllocator* allocators;
+
+  platform::TemporaryAllocator cpu_allocator_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::map<std::pair<platform::Place, cudaStream_t>,
+           std::unique_ptr<TemporaryAllocator>>
+      device_allocator_;
+#endif
+
+  std::mutex mtx_;
+
+  DISABLE_COPY_AND_ASSIGN(DeviceTemporaryAllocator);
+};
+
 class DeviceContext {
  public:
   virtual ~DeviceContext() {}
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index eddebfe92ae80be7e70090aca041df1c6ea4cd11..990e44cd211c001c436dce8ff74a89a5516b38ae 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -227,6 +227,8 @@ void* GetTensorRtDsoHandle() {
 void* GetMKLMLDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
 #endif
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 0d10d82d74a2011b1b2bc088fe88cbfdb49600b8..ac86b38a61c9d8e3e946d9fb3f46d8feba7c034d 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -110,7 +110,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
   }
   places.emplace_back(platform::CPUPlace());
   platform::DeviceContextPool::Init(places);
-
+  platform::DeviceTemporaryAllocator::Init();
 #ifndef PADDLE_WITH_MKLDNN
   platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif
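For readers of the patch: a minimal usage sketch of the temporary-allocator API introduced above. This is illustrative only and not part of the diff; `ScratchExample` is a hypothetical function name, and error handling is elided.

    // Hypothetical caller: obtain per-stream scratch memory and view it as a
    // Tensor, using DeviceTemporaryAllocator::Get and GetTensor<T> from above.
    #include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
    #include "paddle/fluid/platform/device_context.h"

    void ScratchExample(const paddle::platform::CUDADeviceContext& dev_ctx) {
      namespace plat = paddle::platform;
      // One TemporaryAllocator exists per (place, stream) pair; the singleton
      // creates it lazily and wires a stream-synchronizing release callback
      // (see the device_context.cc hunk above).
      auto& allocator = plat::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
      // 256 floats of scratch space, wrapped as a Tensor for existing kernels.
      auto allocation = allocator.Allocate(256 * sizeof(float));
      paddle::framework::Tensor scratch = plat::GetTensor<float>(
          std::move(allocation), paddle::framework::make_ddim({256}));
      // ... launch kernels on dev_ctx.stream() that read/write `scratch` ...
    }  // On destruction the buffer goes to the delete queue, not cudaFree.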
+ +#include "paddle/fluid/platform/temporary_allocator.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" + +DEFINE_double(limit_of_temporary_allocation, -1, + "The up limit of temporary_allocation size."); + +namespace paddle { +namespace platform { +namespace alloc = memory::allocation; + +TemporaryAllocation::TemporaryAllocation( + alloc::AllocationPtr &&underlying_allocation) + : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)) {} + +TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { + temp_mem_queue_.reset(new std::deque()); +} + +bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } + +void TemporaryAllocator::Release(const std::function &callback) { + std::shared_ptr> t_allocations; + { + std::unique_lock lock(mtx_); + callback(); + t_allocations = temp_mem_queue_; + temp_mem_queue_.reset(new std::deque()); + wait_delete_mem_ = 0; + } + for (auto tmp : *t_allocations) { + VLOG(10) << "Delete temporary allocation " << tmp->ptr() + << " size: " << tmp->size(); + delete tmp; + } +} + +void TemporaryAllocator::Free(alloc::Allocation *allocation) { + auto *temp_allocation = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(temp_allocation); + if (platform::is_gpu_place(temp_allocation->place())) { + size_t wait_delete_mem = 0; + { + std::unique_lock lock(mtx_); + temp_mem_queue_->emplace_back(temp_allocation); + wait_delete_mem_ += temp_allocation->size(); + wait_delete_mem = wait_delete_mem_; + VLOG(10) << "Move temporary allocation: " << temp_allocation->ptr() + << " to delete queue: " << temp_allocation->size() << "; " + << "wait_delete_mem: " << wait_delete_mem_; + } + if (FLAGS_limit_of_temporary_allocation > 0 && + wait_delete_mem > FLAGS_limit_of_temporary_allocation) { + Release(callback_); + } + return; + } + delete temp_allocation; +} + +size_t TemporaryAllocator::TemporaryAllocationQueueSize() { + std::unique_lock lock(mtx_); + return temp_mem_queue_ ? temp_mem_queue_->size() : 0; +} + +void TemporaryAllocator::SetCallback(const std::function &callback) { + callback_ = callback; +} + +alloc::Allocation *TemporaryAllocator::AllocateImpl( + size_t size, alloc::Allocator::Attr attr) { + auto raw_allocation = + alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); + auto temp_mem = new TemporaryAllocation(std::move(raw_allocation)); + VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size; + return temp_mem; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..4e32d2d6959e69c94e869491ef8d11708870f7df --- /dev/null +++ b/paddle/fluid/platform/temporary_allocator.h @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e32d2d6959e69c94e869491ef8d11708870f7df
--- /dev/null
+++ b/paddle/fluid/platform/temporary_allocator.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <condition_variable>  // NOLINT
+#include <deque>
+#include <mutex>  // NOLINT
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/lock_guard_ptr.h"
+namespace paddle {
+namespace platform {
+
+class TemporaryAllocation : public memory::allocation::Allocation {
+ public:
+  explicit TemporaryAllocation(
+      memory::allocation::AllocationPtr &&underlying_allocation);
+
+  memory::allocation::AllocationPtr underlying_allocation_;
+};
+
+class TemporaryAllocator : public memory::allocation::Allocator {
+ public:
+  explicit TemporaryAllocator(platform::Place place);
+
+  void Release(const std::function<void()> &callback);
+
+  size_t TemporaryAllocationQueueSize();
+
+  bool IsAllocThreadSafe() const override;
+
+  void SetCallback(const std::function<void()> &callback);
+
+ protected:
+  void Free(memory::allocation::Allocation *allocation) override;
+
+  memory::allocation::Allocation *AllocateImpl(
+      size_t size, memory::allocation::Allocator::Attr attr) override;
+
+ private:
+  platform::Place place_;
+
+  // When the allocation is not held by any variable, it should be placed
+  // into temp_mem_queue immediately.
+  std::shared_ptr<std::deque<TemporaryAllocation *>> temp_mem_queue_{nullptr};
+
+  std::mutex mtx_;
+  size_t wait_delete_mem_{0};
+  std::function<void()> callback_;
+};
+
+}  // namespace platform
+}  // namespace paddle
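One more illustrative fragment, continuing the `gpu_alloc` sketch above: how the `limit_of_temporary_allocation` flag bounds the delete queue. This is essentially what the `add_callback` test below exercises.

    FLAGS_limit_of_temporary_allocation = 10;  // limit in bytes; -1 disables
    auto block = gpu_alloc.Allocate(100);      // 100 pending bytes > 10 limit
    block = nullptr;  // Free(): the queued bytes now exceed the limit, so
                      // Free() immediately calls Release(callback_), which
                      // syncs the stream via the SetCallback() hook and then
                      // drains the queue.
    FLAGS_limit_of_temporary_allocation = -1;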
+ +#include "paddle/fluid/platform/temporary_allocator.h" +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/create_tensor_with_allocationptr.h" +DECLARE_double(limit_of_temporary_allocation); + +namespace paddle { +namespace platform { + +TEST(temporary_allocator, temporary_allocator) { + platform::CPUPlace cpu_place; + TemporaryAllocator alloc(cpu_place); + alloc.Allocate(100); + +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu_place(0); + TemporaryAllocator gpu_alloc(gpu_place); + + auto allocation = gpu_alloc.Allocate(101); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + gpu_alloc.Release([]() {}); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + + { + auto allocation = gpu_alloc.Allocate(102); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + } + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); + gpu_alloc.Release([]() {}); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); +#endif +} + +TEST(temporary_allocator, add_callback) { +#ifdef PADDLE_WITH_CUDA + FLAGS_limit_of_temporary_allocation = 10; + platform::CUDAPlace gpu_place(0); + TemporaryAllocator gpu_alloc(gpu_place); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(gpu_place)); + auto stream = dev_ctx->stream(); + bool deleted = false; + gpu_alloc.SetCallback([stream, &deleted]() { + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaGetLastError()); + deleted = true; + }); + { gpu_alloc.Allocate(100); } + PADDLE_ENFORCE(deleted); + FLAGS_limit_of_temporary_allocation = -1; +#endif +} + +TEST(temporary_allocator, create_tensor_with_allocationptr) { + platform::CPUPlace cpu_place; + TemporaryAllocator cpu_alloc(cpu_place); + { + size_t memory_size = 200; + auto allocation = cpu_alloc.Allocate(memory_size); + void* address = allocation->ptr(); + int numel = memory_size / sizeof(float); + framework::Tensor tensor = + GetTensor(std::move(allocation), framework::make_ddim({numel})); + PADDLE_ENFORCE_EQ(address, tensor.data()); + PADDLE_ENFORCE_EQ(tensor.numel(), numel); + } + +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu_place(0); + TemporaryAllocator gpu_alloc(gpu_place); + + { + size_t memory_size = 300; + auto allocation = gpu_alloc.Allocate(memory_size); + void* address = allocation->ptr(); + int numel = memory_size / sizeof(float); + framework::Tensor tensor = + GetTensor(std::move(allocation), framework::make_ddim({numel})); + PADDLE_ENFORCE_EQ(address, tensor.data()); + PADDLE_ENFORCE_EQ(tensor.numel(), numel); + } + + // The allocation is not holded now, it should be placed to + // TemporaryAllocationQueue. 
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
+  gpu_alloc.Release([]() {});
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+#endif
+}
+
+TEST(temporary_allocator, create_tensor_with_allocationptr2) {
+  platform::CPUPlace cpu_place;
+  TemporaryAllocator cpu_alloc(cpu_place);
+  {
+    size_t memory_size = 400;
+    int numel = memory_size / sizeof(float);
+
+    framework::Tensor out_side_tensor;
+    void* address;
+    {
+      auto allocation = cpu_alloc.Allocate(memory_size);
+      address = allocation->ptr();
+      framework::Tensor tensor = GetTensor<float>(
+          std::move(allocation), framework::make_ddim({numel}));
+      PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+      PADDLE_ENFORCE_EQ(tensor.numel(), numel);
+
+      out_side_tensor.ShareDataWith(tensor);
+    }
+    PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>());
+    PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel);
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu_place(0);
+  TemporaryAllocator gpu_alloc(gpu_place);
+  {
+    void* address;
+    size_t memory_size = 500;
+    int numel = memory_size / sizeof(float);
+    framework::Tensor out_side_tensor;
+    {
+      auto allocation = gpu_alloc.Allocate(memory_size);
+      address = allocation->ptr();
+      framework::Tensor tensor = GetTensor<float>(
+          std::move(allocation), framework::make_ddim({numel}));
+      PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+      PADDLE_ENFORCE_EQ(tensor.numel(), numel);
+
+      out_side_tensor.ShareDataWith(tensor);
+    }
+    PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>());
+    PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel);
+    // The allocation is held by out_side_tensor.
+    PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+    gpu_alloc.Release([]() {});
+    PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+  }
+
+  // The allocation is not held now; it should be placed into the
+  // TemporaryAllocationQueue.
+ PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); + gpu_alloc.Release([]() {}); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); +#endif +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt index eabb51d370aff709e289e1fc727aa2dbb551d82e..af033fa7407b8a81ebb162a2edff2fc41f8f5260 100644 --- a/paddle/fluid/train/demo/CMakeLists.txt +++ b/paddle/fluid/train/demo/CMakeLists.txt @@ -35,16 +35,26 @@ add_executable(demo_trainer demo_trainer.cc) if(WITH_MKLDNN) include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include") - set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0) -endif() + if(WIN32) + set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib) + else(WIN32) + set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0) + endif(WIN32) +endif(WITH_MKLDNN) if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so) + if(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib) + else(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so) + endif(WIN32) else() if(APPLE) set(MATH_LIB cblas) - else(APPLE) + elseif(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib) + else() set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a) endif(APPLE) endif() diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 139176b0d6c5dff511a97c9ac01f09e72a90306b..72c0d03e52246615d731719a7651010a4ede7e05 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -48,18 +48,13 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in IF(WIN32) # Python would use the .pyd by default under Windows series platform set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/) - get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY) set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd) - add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR} - DEPENDS paddle_pybind) ELSE() set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) - add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - DEPENDS paddle_pybind) ENDIF() +add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + DEPENDS paddle_pybind) add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) IF(WIN32) diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index ece97b661fd7d60f8822439a84ee4403b9e3d81c..24621110b18f63779da14edc42765aae3bf4abd6 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -22,6 +22,8 @@ from . import op_frequence from .op_frequence import * from . import quantize from .quantize import * +from . import slim +from .slim import * from . 
import utils from .utils import * @@ -30,4 +32,5 @@ __all__ += decoder.__all__ __all__ += memory_usage_calc.__all__ __all__ += op_frequence.__all__ __all__ += quantize.__all__ +__all__ += slim.__all__ __all__ += utils.__all__ diff --git a/python/paddle/fluid/contrib/slim/__init__.py b/python/paddle/fluid/contrib/slim/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..22dbf7c8b6bb2da7c310a20bdcbaffca248575b0 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import * +from .graph import * +from .prune import * +__all__ = [ + 'build_compressor', + 'CompressPass', + 'ImitationGraph', + 'SensitivePruneStrategy', + 'MagnitudePruner', + 'RatioPruner', +] diff --git a/python/paddle/fluid/contrib/slim/core/__init__.py b/python/paddle/fluid/contrib/slim/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7826d5830a6f7f6d42cb1275c2289695c080e52f --- /dev/null +++ b/python/paddle/fluid/contrib/slim/core/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import config +from .config import * +from . import compress_pass +from .compress_pass import * +from . import strategy +from .strategy import * +from . import pass_builder +from .pass_builder import * + +__all__ = config.__all__ + compress_pass.__all__ + strategy.__all__ + pass_builder.__all__ diff --git a/python/paddle/fluid/contrib/slim/core/compress_pass.py b/python/paddle/fluid/contrib/slim/core/compress_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..c4c348b878a1df43d7fb909f506c8cf65366866f --- /dev/null +++ b/python/paddle/fluid/contrib/slim/core/compress_pass.py @@ -0,0 +1,129 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from ....core import CPUPlace
+from ..graph import get_executor
+
+__all__ = ['Context', 'CompressPass']
+
+
+class Context(object):
+    """
+    The context in the process of compression.
+    Args:
+        exe: The executor used to execute graph.
+        graph: The graph to be compressed.
+        scope: The scope used to execute graph.
+        program_exe: The program_exe is used to execute the program
+                     created for modifying the variables in scope.
+    """
+
+    def __init__(self, exe, graph, scope, program_exe=None):
+        # The total number of epochs to be trained.
+        self.epoch = 0
+        # Current epoch
+        self.epoch_id = 0
+        # Current batch
+        self.batch_id = 0
+        self.exe = exe
+        self.graph = graph
+        self.scope = scope
+        self.program_exe = program_exe
+
+
+class CompressPass(object):
+    """
+    The pass used to compress a model.
+    Args:
+        place: The device used in compression.
+        data_reader: The data_reader used to run graph.
+        data_feeder: The data_feeder used to run graph.
+        scope: The scope used to run graph.
+        metrics: The metrics for evaluating the model.
+        epoch: The total number of epochs to train in compression.
+        program_exe: The program_exe is used to execute the program
+                     created for modifying the variables in scope.
+    """
+
+    def __init__(self,
+                 place=None,
+                 data_reader=None,
+                 data_feeder=None,
+                 scope=None,
+                 metrics=None,
+                 epoch=None,
+                 program_exe=None):
+        self.strategies = []
+        self.place = CPUPlace() if place is None else place
+        self.data_reader = data_reader
+        self.data_feeder = data_feeder
+        self.scope = scope
+        self.metrics = metrics
+        self.epoch = epoch
+        self.program_exe = program_exe
+
+    def add_strategy(self, strategy):
+        """
+        Add a strategy to the current compress pass.
+        Args:
+            strategy: The strategy to be added into the current compress pass.
+        """
+        self.strategies.append(strategy)
+        self.epoch = max(strategy.end_epoch, self.epoch)
+
+    def apply(self, graph):
+        """
+        Compress a model.
+        Args:
+            graph: The target graph to be compressed.
+        """
+        self.executor = get_executor(graph, self.place)
+        context = Context(
+            self.executor, graph, self.scope, program_exe=self.program_exe)
+
+        for strategy in self.strategies:
+            strategy.on_compress_begin(context)
+
+        for epoch in range(self.epoch):
+
+            for strategy in self.strategies:
+                strategy.on_epoch_begin(context)
+
+            for data in self.data_reader():
+
+                for strategy in self.strategies:
+                    strategy.on_batch_begin(context)
+                fetches = None
+                if self.metrics:
+                    fetches = self.metrics.values()
+                feed = None
+                if self.data_feeder:
+                    feed = self.data_feeder.feed(data)
+                results = self.executor.run(graph,
+                                            fetches=fetches,
+                                            scope=self.scope,
+                                            feed=feed)
+                if results:
+                    print("results: {}".format(
+                        zip(self.metrics.keys(), results)))
+                for strategy in self.strategies:
+                    strategy.on_batch_end(context)
+                context.batch_id += 1
+
+            for strategy in self.strategies:
+                strategy.on_epoch_end(context)
+            context.epoch_id += 1
+
+        for strategy in self.strategies:
+            strategy.on_compress_end(context)
diff --git a/python/paddle/fluid/contrib/slim/core/config.py b/python/paddle/fluid/contrib/slim/core/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..811c45700376aff9883fe197007b582f63817f03
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/core/config.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import funcsigs
+import yaml
+from collections import OrderedDict
+from ..prune import *
+from .compress_pass import *
+from .strategy import *
+
+__all__ = ['ConfigFactory']
+"""This factory is used to create instances by loading and parsing a
+configuration file in YAML format.
+"""
+
+
+class ConfigFactory(object):
+    def __init__(self, config):
+        """Init a factory from a configuration file."""
+        self.instances = {}
+        self.version = None
+        self._parse_config(config)
+
+    def get_compress_pass(self):
+        """
+        Get the compress pass from the factory.
+        """
+        return self.instance('compress_pass')
+
+    def instance(self, name):
+        """
+        Get an instance from the factory.
+        """
+        if name in self.instances:
+            return self.instances[name]
+        else:
+            return None
+
+    def _new_instance(self, name, attrs):
+        if name not in self.instances:
+            class_ = globals()[attrs['class']]
+            sig = funcsigs.signature(class_.__init__)
+            keys = [
+                param.name for param in sig.parameters.values()
+                if (param.kind == param.POSITIONAL_OR_KEYWORD)
+            ][1:]
+            keys = set(attrs.keys()).intersection(set(keys))
+            args = {}
+            for key in keys:
+                value = attrs[key]
+                if isinstance(value, str) and value in self.instances:
+                    value = self.instances[value]
+                args[key] = value
+            self.instances[name] = class_(**args)
+        return self.instances.get(name)
+
+    def _parse_config(self, config):
+        assert config
+        with open(config, 'r') as config_file:
+            key_values = self._ordered_load(config_file)
+            for key in key_values:
+                # parse version
+                if key == 'version' and self.version is None:
+                    self.version = int(key_values['version'])
+                assert self.version == int(key_values['version'])
+
+                # parse pruners
+                if key == 'pruners' or key == 'strategies':
+                    instances = key_values[key]
+                    for name in instances:
+                        self._new_instance(name, instances[name])
+
+                if key == 'compress_pass':
+                    compress_pass = self._new_instance(key, key_values[key])
+                    for name in key_values[key]['strategies']:
+                        strategy = self.instance(name)
+                        compress_pass.add_strategy(strategy)
+
+                if key == 'include':
+                    for config_file in key_values[key]:
+                        self._parse_config(config_file.strip())
+
+    def _ordered_load(self,
+                      stream,
+                      Loader=yaml.Loader,
+                      object_pairs_hook=OrderedDict):
+        """
+        See: https://stackoverflow.com/questions/5121931/in-python-how-can-you-load-yaml-mappings-as-ordereddicts
+        """
+
+        class OrderedLoader(Loader):
+            pass
+
+        def construct_mapping(loader, node):
+            loader.flatten_mapping(node)
+            return object_pairs_hook(loader.construct_pairs(node))
+
+        OrderedLoader.add_constructor(
+            yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping)
+        return yaml.load(stream, OrderedLoader)
diff --git a/python/paddle/fluid/contrib/slim/core/pass_builder.py b/python/paddle/fluid/contrib/slim/core/pass_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc1ddc94e04f1d606292071ba7e5cc74fedd5d36
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/core/pass_builder.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .compress_pass import CompressPass +from .config import ConfigFactory + +__all__ = ['build_compressor'] + + +def build_compressor(place=None, + data_reader=None, + data_feeder=None, + scope=None, + metrics=None, + epoch=None, + config=None): + if config is not None: + factory = ConfigFactory(config) + comp_pass = factory.get_compress_pass() + else: + comp_pass = CompressPass() + comp_pass.place = place + comp_pass.data_reader = data_reader + comp_pass.data_feeder = data_feeder + comp_pass.scope = scope + comp_pass.metrics = metrics + comp_pass.epoch = epoch + return comp_pass diff --git a/python/paddle/fluid/contrib/slim/core/strategy.py b/python/paddle/fluid/contrib/slim/core/strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..74d98e98b0c390599acfaefeb0636a599b46d391 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/core/strategy.py @@ -0,0 +1,48 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['Strategy'] + + +class Strategy(object): + """ + Base class for all strategies. + """ + + def __init__(self, start_epoch=0, end_epoch=10): + """ + Args: + start_epoch: The first epoch to apply the strategy. + end_epoch: The last epoch to apply the strategy. 
+ """ + self.start_epoch = start_epoch + self.end_epoch = end_epoch + + def on_compress_begin(self, context): + pass + + def on_epoch_begin(self, context): + pass + + def on_epoch_end(self, context): + pass + + def on_batch_begin(self, context): + pass + + def on_batch_end(self, context): + pass + + def on_compress_end(self, context): + pass diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea888fa2c74a23b4769f75dce6a776afcca41a51 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml @@ -0,0 +1,28 @@ +version: 1.0 +pruners: + pruner_1: + class: 'RatioPruner' + ratios: + 'conv1_1.w': 0.3 + 'conv1_2.w': 0.4 + '*': 0.9 + group_dims: + '*': [1, 2, 3] + criterions: + '*': 'l1-norm' +strategies: + strategy_1: + class: 'SensitivePruneStrategy' + pruner: 'pruner_1' + start_epoch: 0 + end_epoch: 10 + delta_rate: 0.20 + acc_loss_threshold: 0.2 + sensitivities: + 'conv1_1.w': 0.4 + +compress_pass: + class: 'CompressPass' + epoch: 100 + strategies: + - strategy_1 diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..21c59c0c9d2d9b76932ab6eeff73754940a3bfa0 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py @@ -0,0 +1,69 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import paddle.fluid as fluid
+import paddle
+import os
+import sys
+from paddle.fluid.contrib.slim import CompressPass
+from paddle.fluid.contrib.slim import build_compressor
+from paddle.fluid.contrib.slim import ImitationGraph
+
+
+class LinearModel(object):
+    def __init__(self):
+        pass
+
+    def train(self):
+        train_program = fluid.Program()
+        startup_program = fluid.Program()
+        startup_program.random_seed = 10
+        with fluid.program_guard(train_program, startup_program):
+            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+            predict = fluid.layers.fc(input=x, size=1, act=None)
+            cost = fluid.layers.square_error_cost(input=predict, label=y)
+            avg_cost = fluid.layers.mean(cost)
+            eval_program = train_program.clone()
+            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+            sgd_optimizer.minimize(avg_cost)
+
+        train_reader = paddle.batch(
+            paddle.dataset.uci_housing.train(), batch_size=1)
+        eval_reader = paddle.batch(
+            paddle.dataset.uci_housing.test(), batch_size=1)
+        place = fluid.CPUPlace()
+        train_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+        eval_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+        exe = fluid.Executor(place)
+        exe.run(startup_program)
+        train_metrics = {"loss": avg_cost.name}
+        eval_metrics = {"loss": avg_cost.name}
+
+        graph = ImitationGraph(train_program)
+        config = './config.yaml'
+        comp_pass = build_compressor(
+            place,
+            data_reader=train_reader,
+            data_feeder=train_feeder,
+            scope=fluid.global_scope(),
+            metrics=train_metrics,
+            epoch=1,
+            config=config)
+        comp_pass.apply(graph)
+
+
+if __name__ == "__main__":
+    model = LinearModel()
+    model.train()
diff --git a/python/paddle/fluid/contrib/slim/graph/__init__.py b/python/paddle/fluid/contrib/slim/graph/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d65472d193b639f0766e278ec14b5dc36c5d62bc
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import executor
+from .executor import *
+from . import graph
+from .graph import *
+from . import graph_pass
+from .graph_pass import *
+__all__ = executor.__all__
+__all__ += graph.__all__
+__all__ += graph_pass.__all__
diff --git a/python/paddle/fluid/contrib/slim/graph/executor.py b/python/paddle/fluid/contrib/slim/graph/executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..c02c3af82013287bf19e1869cb60dc65239b720a
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/executor.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import abc
+from abc import abstractmethod
+from .... import executor
+from .graph import IRGraph, ImitationGraph
+
+__all__ = ['get_executor']
+
+
+class GraphExecutor(object):
+    __metaclass__ = abc.ABCMeta
+
+    def __init__(self, place):
+        self.place = place
+
+    @abstractmethod
+    def run(self, graph, fetches=None, feed=None):
+        pass
+
+
+class IRGraphExecutor(GraphExecutor):
+    def run(self, graph, fetches, feed=None):
+        pass
+
+
+class ImitationGraphExecutor(GraphExecutor):
+    def __init__(self, place):
+        super(ImitationGraphExecutor, self).__init__(place)
+        self.exe = executor.Executor(place)
+
+    def run(self, graph, scope=None, fetches=None, feed=None):
+        assert isinstance(graph, ImitationGraph)
+        fetch_list = None
+        if fetches:
+            fetch_list = [
+                graph.program.global_block().var(name) for name in fetches
+            ]
+        results = self.exe.run(graph.program,
+                               scope=scope,
+                               fetch_list=fetch_list,
+                               feed=feed)
+        return results
+
+
+def get_executor(graph, place):
+    if isinstance(graph, ImitationGraph):
+        return ImitationGraphExecutor(place)
+    if isinstance(graph, IRGraph):
+        return IRGraphExecutor(place)
diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d6b0702035d49189c0919f976ea3c0c52663547
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/graph.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....framework import Program
+
+__all__ = ['Graph', 'ImitationGraph', 'IRGraph']
+
+
+class Graph(object):
+    """
+    Base class for all graphs.
+    """
+
+    def __init__(self):
+        pass
+
+    def all_parameters(self):
+        """
+        Return all the parameters in the current graph.
+        """
+        pass
+
+
+class ImitationGraph(Graph):
+    def __init__(self, program=None):
+        super(ImitationGraph, self).__init__()
+        self.program = Program() if program is None else program
+
+    def all_parameters(self):
+        return self.program.global_block().all_parameters()
+
+
+class IRGraph(Graph):
+    pass
diff --git a/python/paddle/fluid/contrib/slim/graph/graph_pass.py b/python/paddle/fluid/contrib/slim/graph/graph_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..1db6c4f110daa44be7fcbcc36f47224797b6dc88
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/graph_pass.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['GraphPass', 'PruneParameterPass']
+
+
+class GraphPass(object):
+    """
+    Base class for all graph passes.
+    """
+
+    def __init__(self):
+        pass
+
+    def apply(self, graph):
+        pass
+
+
+class PruneParameterPass(GraphPass):
+    """
+    Generate a graph for pruning parameters from a target graph.
+    """
+
+    def __init__(self, pruned_params, thresholds):
+        super(PruneParameterPass, self).__init__()
+        self.pruned_params = pruned_params
+        self.thresholds = thresholds
+        self.default_threshold = thresholds['*']
+
+    def apply(self, graph):
+        pass
diff --git a/python/paddle/fluid/contrib/slim/prune/__init__.py b/python/paddle/fluid/contrib/slim/prune/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..764a45bb130a9993015858f1cbdbc9f3b864bd5e
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/prune/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import pruner
+from .pruner import *
+from . import prune_strategy
+from .prune_strategy import *
+
+__all__ = pruner.__all__
+__all__ += prune_strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..34c5107daa3cde10e7995902be37e34e19664da8
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..core.strategy import Strategy
+from ....framework import Program, program_guard
+from .... import layers
+import numpy as np
+
+__all__ = ['SensitivePruneStrategy', 'PruneStrategy']
+
+
+class SensitivePruneStrategy(Strategy):
+    def __init__(self,
+                 pruner=None,
+                 start_epoch=0,
+                 end_epoch=10,
+                 delta_rate=0.20,
+                 acc_loss_threshold=0.2,
+                 sensitivities=None):
+        super(SensitivePruneStrategy, self).__init__(start_epoch, end_epoch)
+        self.pruner = pruner
+        self.delta_rate = delta_rate
+        self.acc_loss_threshold = acc_loss_threshold
+        self.sensitivities = sensitivities
+
+
+class PruneStrategy(Strategy):
+    """
+    The strategy that prunes weights by threshold or ratio iteratively.
+    """
+
+    def __init__(self,
+                 pruner,
+                 mini_batch_pruning_frequency=1,
+                 start_epoch=0,
+                 end_epoch=10):
+        super(PruneStrategy, self).__init__(start_epoch, end_epoch)
+        self.pruner = pruner
+        self.mini_batch_pruning_frequency = mini_batch_pruning_frequency
+
+    def _trigger(self, context):
+        return (context.batch_id % self.mini_batch_pruning_frequency == 0 and
+                self.start_epoch <= context.epoch_id < self.end_epoch)
+
+    def on_batch_end(self, context):
+        if self._trigger(context):
+            prune_program = Program()
+            with program_guard(prune_program):
+                for param in context.graph.all_parameters():
+                    prune_program.global_block().clone_variable(param)
+                    p = prune_program.global_block().var(param.name)
+                    zeros_mask = self.pruner.prune(p)
+                    pruned_param = p * zeros_mask
+                    layers.assign(input=pruned_param, output=param)
+            context.program_exe.run(prune_program, scope=context.scope)
diff --git a/python/paddle/fluid/contrib/slim/prune/pruner.py b/python/paddle/fluid/contrib/slim/prune/pruner.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca72bcb6f6004c18f3ec794850e0aeaecb92d7ac
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/prune/pruner.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from .... import layers
+
+__all__ = ['Pruner', 'MagnitudePruner', 'RatioPruner']
+
+
+class Pruner(object):
+    """
+    Base class of all pruners.
+    """
+
+    def __init__(self):
+        pass
+
+    def prune(self, param):
+        pass
+
+
+class MagnitudePruner(Pruner):
+    """
+    Pruner used to prune a parameter by threshold.
+    """
+
+    def __init__(self, threshold):
+        self.threshold = threshold
+
+    def prune(self, param, threshold=None):
+        if threshold is None:
+            thres = layers.fill_constant(
+                shape=[1], dtype='float32', value=self.threshold)
+        else:
+            thres = threshold
+        zeros_mask = layers.less_than(x=param, y=thres)
+        return zeros_mask
+
+
+class RatioPruner(Pruner):
+    """
+    Pruner used to prune a parameter by ratio.
+    """
+
+    def __init__(self, ratios=None):
+        """
+        Args:
+            ratios: dict with pair (parameter_name, pruned_ratio).
+        """
+        self.ratios = ratios
+
+    def prune(self, param, ratio=None):
+        """
+        Args:
+            ratio: `ratio=40%` means pruning (1 - 40%) of the weights to zero.
+ """ + if ratio is None: + rat = self.ratios[ + param.name] if param.name in self.ratios else self.ratios['*'] + else: + rat = ratio + if rat < 1.0: + k = max(int(rat * np.prod(param.shape)), 1) + param_vec = layers.reshape(x=param, shape=[1, -1]) + param_topk, _ = layers.topk(param_vec, k=k) + threshold = layers.slice( + param_topk, axes=[1], starts=[-1], ends=[k]) + threshold = layers.reshape(x=threshold, shape=[1]) + zeros_mask = layers.less_than(x=param, y=threshold) + else: + zeros_mask = layers.ones(param.shape) + return zeros_mask diff --git a/python/paddle/fluid/contrib/slim/unitest/__init__.py b/python/paddle/fluid/contrib/slim/unitest/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6d41233e227dc7bab94ee4284cc25e12b45bf469 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml b/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db488b96330210df15b02b19d90abd5c9101f844 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml @@ -0,0 +1,29 @@ +version: 1.0 +include: ["./unitest/configs/pruners.yaml", "./unitest/configs/pruners_0.yaml"] +pruners: + pruner_1: + class: 'RatioPruner' + ratios: + 'conv1_1.w': 0.3 + 'conv1_2.w': 0.4 + '*': 0.9 + group_dims: + '*': [1, 2, 3] + criterions: + '*': 'l1-norm' +strategies: + strategy_1: + class: 'SensitivePruneStrategy' + pruner: 'pruner_2' + start_epoch: 0 + end_epoch: 10 + delta_rate: 0.20 + acc_loss_threshold: 0.2 + sensitivities: + 'conv1_1.w': 0.4 + +compress_pass: + class: 'CompressPass' + epoch: 100 + strategies: + - strategy_1 diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml b/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml new file mode 100644 index 0000000000000000000000000000000000000000..235092c595bf7c653221c7fe2b381fecf487fa49 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml @@ -0,0 +1,12 @@ +version: 1.0 +pruners: + pruner_2: + class: 'RatioPruner' + ratios: + 'conv1_1.w': 0.5 + 'conv1_2.w': 0.2 + '*': 0.7 + group_dims: + '*': [1, 2, 3] + criterions: + '*': 'l1-norm' diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml b/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cd2ef9eb56ddbc1367ce2e3b413372fbcd542bde --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml @@ -0,0 +1,12 @@ +version: 1.0 +pruners: + pruner_3: + class: 'RatioPruner' + ratios: + 'conv1_1.w': 0.5 + 'conv1_2.w': 0.2 + '*': 0.7 + group_dims: + '*': [1, 2, 3] + criterions: + '*': 'l1-norm' diff --git a/python/paddle/fluid/contrib/slim/unitest/test_factory.py 
b/python/paddle/fluid/contrib/slim/unitest/test_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..07f28aac905d1a2813dbde6143235c7916fd9278 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/test_factory.py @@ -0,0 +1,41 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.contrib.slim import ConfigFactory +import unittest + + +class TestFactory(unittest.TestCase): + def test_parse(self): + factory = ConfigFactory('./unitest/configs/config.yaml') + + pruner = factory.instance('pruner_1') + self.assertEquals(pruner.ratios['conv1_1.w'], 0.3) + + pruner = factory.instance('pruner_2') + self.assertEquals(pruner.ratios['*'], 0.7) + + strategy = factory.instance('strategy_1') + pruner = strategy.pruner + self.assertEquals(pruner.ratios['*'], 0.7) + + compress_pass = factory.get_compress_pass() + self.assertEquals(compress_pass.epoch, 100) + + strategy = compress_pass.strategies[0] + self.assertEquals(strategy.delta_rate, 0.2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py index 0b61c23d07e95acf7b4564753f748e7fb497e73e..8485d7d32fed8554c6d9afd610db230f52497da1 100644 --- a/python/paddle/fluid/net_drawer.py +++ b/python/paddle/fluid/net_drawer.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) try: - from .graphviz import Digraph + from .graphviz import Graph except ImportError: logger.info( 'Cannot import graphviz, which is required for drawing a network. 
This ' @@ -112,7 +112,7 @@ def draw_graph(startup_program, main_program, **kwargs): filename = kwargs.get("filename") if filename == None: filename = str(graph_id) + ".gv" - g = Digraph( + g = Graph( name=str(graph_id), filename=filename, graph_attr=GRAPH_STYLE, diff --git a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py index 61ac8790112ceadfdef7b18aad70af77644581cd..0c201b9e4f48df94924a248d820ae2cf73367560 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py @@ -23,16 +23,6 @@ class TestTransposeMKLDNN(TestTransposeOp): def init_op_type(self): self.op_type = "transpose2" self.use_mkldnn = True - self.is_test = True - return - - def test_check_grad(self): - return - - def test_check_grad_no_input(self): - return - - def test_check_grad_no_filter(self): return diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 93be9d28da7a73f4fa972acf0dbd95167e7dfca3..a38540a7240636415ef4703609c5a3e8e83ed1da 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -27,7 +27,6 @@ class TestTransposeOp(OpTest): self.attrs = { 'axis': list(self.axis), 'use_mkldnn': self.use_mkldnn, - 'is_test': self.is_test, } self.outputs = { 'XShape': np.random.random(self.shape).astype("float32"), @@ -37,7 +36,6 @@ class TestTransposeOp(OpTest): def init_op_type(self): self.op_type = "transpose2" self.use_mkldnn = False - self.is_test = False def test_check_output(self): self.check_output(no_check_set=['XShape']) diff --git a/python/requirements.txt b/python/requirements.txt index 2f81d85df0626b294f4d861706b5c1b7ec9841d5..03d5e33e88cd5f1138ca8f6a6e885d6acfbc260e 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -9,3 +9,5 @@ Pillow nltk>=3.2.2 graphviz six +funcsigs +pyyaml diff --git a/python/setup.py.in b/python/setup.py.in index 5d5f2dd0f18cd3e707dca8b9f337f2f2a07d47aa..c9afe6c885658b88ac520aad2e7b13facda02a92 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -109,6 +109,10 @@ packages=['paddle', 'paddle.fluid.contrib', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', + 'paddle.fluid.contrib.slim', + 'paddle.fluid.contrib.slim.core', + 'paddle.fluid.contrib.slim.graph', + 'paddle.fluid.contrib.slim.prune', 'paddle.fluid.contrib.utils', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details'] @@ -140,8 +144,6 @@ if '${WITH_FLUID_ONLY}'== 'OFF': '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]} -if os.name == 'nt': - package_data['paddle.fluid'] += ['openblas' + ext_name] if '${WITH_FLUID_ONLY}'== 'OFF': package_data['paddle.v2.master']=['libpaddle_master' + ext_name] @@ -166,11 +168,17 @@ package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': - shutil.copy('${MKLML_LIB}', libs_path) - shutil.copy('${MKLML_IOMP_LIB}', libs_path) - package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name] + shutil.copy('${MKLML_SHARED_LIB}', libs_path) + shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) + package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name] 
+else: + if os.name == 'nt': + # copy the openblas.dll + shutil.copy(os.path.dirname('${CBLAS_LIBRARIES}') + '/openblas' + ext_name, libs_path) + package_data['paddle.libs'] += ['openblas' + ext_name] + if '${WITH_MKLDNN}' == 'ON': - if '${CMAKE_BUILD_TYPE}' == 'Release': + if '${CMAKE_BUILD_TYPE}' == 'Release' and os.name != 'nt': # only change rpath in Release mode. # TODO(typhoonzero): use install_name_tool to patch mkl libs once # we can support mkl on mac. @@ -181,7 +189,7 @@ if '${WITH_MKLDNN}' == 'ON': command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" if os.system(command) != 0: raise Exception("patch libmkldnn.so failed, command: %s" % command) - package_data['paddle.libs']+=['libmkldnn.so.0'] + package_data['paddle.libs']+=['libmkldnn.so.0' if os.name != 'nt' else ('mkldnn' + ext_name)] shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) if '${WITH_NGRAPH}' == 'ON': # only change rpath in Release mode,