Merge branch 'develop' into mm_dnn

f01c9668 · Tao Luo · 22c71398 · 51a9fca3 · f01c9668 · f01c9668
66 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,16 +126,12 @@ if(ANDROID OR IOS)
    add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()
-if (APPLE OR WIN32)
+if (APPLE)
    set(WITH_MKL OFF CACHE STRING
-        "Disable MKL for building on mac and windows" FORCE)
+        "Disable MKL for building on mac" FORCE)
 endif()
 if (WIN32)
-    set(WITH_DSO OFF CACHE STRING
-            "Disable DSO when compiling for Windows" FORCE)
-    set(WITH_MKL OFF CACHE STRING
-            "Disable MKL when compiling for Windows" FORCE)
    set(WITH_DISTRIBUTE OFF CACHE STRING
            "Disable DISTRIBUTE when compiling for Windows" FORCE)
    set(WITH_C_API OFF CACHE STRING

--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -44,9 +44,9 @@ if(WIN32)
 set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
 endif(WIN32)
-if(Apple)
+if(APPLE)
 set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
-endif(Apple)
+endif(APPLE)
 find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
    PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -23,15 +23,14 @@ SET(MKLDNN_SOURCES_DIR    ${THIRD_PARTY_PATH}/mkldnn)
 SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
-IF(WIN32 OR APPLE)
+IF(APPLE)
    MESSAGE(WARNING
-        "Windows or Mac is not supported with MKLDNN in Paddle yet."
+        "Mac is not supported with MKLDNN in Paddle yet."
        "Force WITH_MKLDNN=OFF")
-    SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
+    SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in MacOS" FORCE)
    return()
 ENDIF()
-SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
 MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
@@ -44,10 +43,14 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
 ELSE()
    MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
-SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
-SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
+IF(NOT WIN32)
-SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
+    SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
-SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+    SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
+    SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
+    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+ENDIF(NOT WIN32)
 ExternalProject_Add(
    ${MKLDNN_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -58,8 +61,15 @@ ExternalProject_Add(
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
    CMAKE_ARGS          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS          -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+    CMAKE_ARGS          -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+    CMAKE_ARGS          -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+    CMAKE_ARGS          -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+    CMAKE_ARGS          -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
    CMAKE_ARGS          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+    CMAKE_ARGS          -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
    CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
@@ -67,6 +77,11 @@ ExternalProject_Add(
    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
                        -DMKLROOT:PATH=${MKLML_ROOT}
 )
+if(WIN32)
+    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
+else(WIN32)
+    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
+endif(WIN32)
 ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
@@ -85,10 +100,14 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
 # copy the real so.0 lib to install dir
 # it can be directly contained in wheel or capi
-SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
+if(WIN32)
-ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
+    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll)
-    COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
+else(WIN32)
+    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
+    ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
+            COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
            DEPENDS mkldnn)
+endif(WIN32)
 ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
 IF(WITH_C_API)

--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -16,56 +16,67 @@ IF(NOT ${WITH_MKLML})
  return()
 ENDIF(NOT ${WITH_MKLML})
-IF(WIN32 OR APPLE)
+IF(APPLE)
    MESSAGE(WARNING
-        "Windows or Mac is not supported with MKLML in Paddle yet."
+        "Mac is not supported with MKLML in Paddle yet."
        "Force WITH_MKLML=OFF")
    SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
    return()
 ENDIF()
 INCLUDE(ExternalProject)
-SET(MKLML_PROJECT       "extern_mklml")
-IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
-  SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
-SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
-SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
 SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
 SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
 SET(MKLML_ROOT          ${MKLML_INSTALL_DIR})
 SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
 SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
-SET(MKLML_LIB           ${MKLML_LIB_DIR}/libmklml_intel.so)
+if(WIN32)
-SET(MKLML_IOMP_LIB      ${MKLML_LIB_DIR}/libiomp5.so)
+    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/mklml.lib)
+    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5md.lib)
+    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/mklml.dll)
+    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5md.dll)
+else()
+    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml_intel.so)
+    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.so)
+    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/libmklml_intel.so)
+    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5.so)
+endif()
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
-INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
+IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
+    MESSAGE(STATUS "use pre defined download url")
+    if(WIN32)
+        SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE)
+        SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+    else()
+        SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
+        SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+    ENDIF()
+endif()
-FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
+SET(MKLML_PROJECT       "extern_mklml")
-  "PROJECT(MKLML)\n"
+MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
-  "cmake_minimum_required(VERSION 3.0)\n"
+SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
-  "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
+SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
-  "        DESTINATION ${MKLML_DST_DIR})\n")
 ExternalProject_Add(
    ${MKLML_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    PREFIX                 ${MKLML_SOURCE_DIR}
+    URL                    ${MKLML_URL}
    DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz 
-                          && tar zxf ${MKLML_VER}.tgz
    DOWNLOAD_NO_PROGRESS  1
+    CONFIGURE_COMMAND     ""
+    BUILD_COMMAND         ""
    UPDATE_COMMAND ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
+    INSTALL_COMMAND
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
+        ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/include ${MKLML_INC_DIR} &&
+        ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR}
 )
+INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
 ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
 ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -267,7 +267,11 @@ function(cc_library TARGET_NAME)
          list(APPEND cc_library_DEPS dynload_mklml)
        endif()
        add_dependencies(${TARGET_NAME} mklml)
+        if(WIN32)
+          target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
+        else(WIN32)
          target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
+        endif(WIN32)
      endif()
      # remove link to python, see notes at:
      # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -115,20 +115,20 @@ if (NOT PROTOBUF_FOUND OR WIN32)
            )
 endif ()
-if (NOT CBLAS_FOUND)
+if (WITH_MKLML)
-    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
-    copy(openblas_lib
-            SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
-            DSTS ${dst_dir} ${dst_dir}
-            DEPS extern_openblas
-            )
-elseif (WITH_MKLML)
    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
    copy(mklml_lib
            SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
            DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
            DEPS mklml
            )
+elseif (NOT CBLAS_FOUND OR WIN32)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
+    copy(openblas_lib
+            SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
+            DSTS ${dst_dir} ${dst_dir}
+            DEPS extern_openblas
+            )
 endif ()
 if (WITH_MKLDNN)

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -351,6 +351,23 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
+paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
+paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.CompressPass.apply ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.ImitationGraph.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.ImitationGraph.all_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.__init__ ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None))
+paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.MagnitudePruner.__init__ ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.MagnitudePruner.prune ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.RatioPruner.__init__ ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.RatioPruner.prune ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)

--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -157,13 +157,8 @@ bool CheckLoD(const LoD &in, int tensor_height) {
    if (level.size() < 2) return false;
    // check: the first offset(the begin offset) of each level should be 0.
    if (level.front() != 0) return false;
-    // check: all the offsets in a level should be ascending(no same items
+    // check: all the offsets in a level should be ascending(allow same items)
-    // allows).
+    if (!std::is_sorted(level.begin(), level.end())) {
-    if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) {
-          if (a < b) return true;
-          return false;
-        })) {
-      LOG(INFO) << "ascending error";
      return false;
    }
  }

--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -217,6 +217,11 @@ TEST(LoD, CheckLoD) {
  // check with underlying tensor storage.
  ASSERT_TRUE(CheckLoD(relative_lod, 5));
  ASSERT_FALSE(CheckLoD(relative_lod, 9));
+  // check whether lod is ascending-sorted (allow same items)
+  ASSERT_TRUE(CheckLoD({{0, 1, 2, 3, 4, 5}}, 5));
+  ASSERT_TRUE(CheckLoD({{0, 1, 3, 3, 4, 5}}, 5));
+  ASSERT_FALSE(CheckLoD({{0, 1, 3, 2, 5}}, 5));
 }
 TEST(LoD, CheckAbsLoD) {

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -476,6 +476,28 @@ const Tensor* ExecutionContext::LegacyInput<Tensor>(
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
    const std::string& name) const {
+  auto it = ctx_.inputs.find(name);
+  if (it == ctx_.inputs.end()) {
+    return {};
+  }
+  const std::vector<Variable*>& vars = it->second;
+  std::vector<const Tensor*> res;
+  res.reserve(vars.size());
+  std::transform(vars.begin(), vars.end(), std::back_inserter(res),
+                 [&](Variable* var) -> const Tensor* {
+                   if (var == nullptr) return nullptr;
+                   PADDLE_ENFORCE(
+                       var->IsType<LoDTensor>(),
+                       "should be LoDTensor, but the received type is %s",
+                       var->Type().name());
+                   return &(var->Get<LoDTensor>());
+                 });
+  return res;
+}
+template <>
+const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
+    const std::string& name) const {
  auto names = op().Inputs(name);
  std::vector<const Tensor*> res;
  res.reserve(names.size());

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -197,8 +197,31 @@ class ExecutionContext {
  const std::vector<const Variable*> MultiInputVar(
      const std::string& name) const {
-    auto names = op_.Inputs(name);
+    auto it = ctx_.inputs.find(name);
+    if (it == ctx_.inputs.end()) {
+      return {};
+    }
    std::vector<const Variable*> res;
+    res.reserve(it->second.size());
+    std::transform(it->second.begin(), it->second.end(),
+                   std::back_inserter(res),
+                   [this](Variable* var) { return var; });
+    return res;
+  }
+  std::vector<Variable*> MultiOutputVar(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    auto it = ctx_.outputs.find(name);
+    if (it == ctx_.outputs.end()) {
+      return {};
+    }
+    return it->second;
+  }
+  const std::vector<Variable*> LegacyMultiInputVar(
+      const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<Variable*> res;
    res.reserve(names.size());
    std::transform(names.begin(), names.end(), std::back_inserter(res),
                   [this](const std::string& name) {
@@ -208,7 +231,7 @@ class ExecutionContext {
    return res;
  }
-  std::vector<Variable*> MultiOutputVar(const std::string& name) const {
+  std::vector<Variable*> LegacyMultiOutputVar(const std::string& name) const {
    auto names = op_.Outputs(name);
    std::vector<Variable*> res;
    res.reserve(names.size());
@@ -250,6 +273,38 @@ class ExecutionContext {
  template <typename T>
  const std::vector<const T*> MultiInput(const std::string& name) const {
+    auto it = ctx_.inputs.find(name);
+    if (it == ctx_.inputs.end()) {
+      return {};
+    }
+    const std::vector<Variable*>& vars = it->second;
+    std::vector<const T*> res;
+    res.reserve(vars.size());
+    std::transform(vars.begin(), vars.end(), std::back_inserter(res),
+                   [&](Variable* var) -> const T* {
+                     return var == nullptr ? nullptr : &var->Get<T>();
+                   });
+    return res;
+  }
+  template <typename T>
+  std::vector<T*> MultiOutput(const std::string& name) const {
+    auto it = ctx_.outputs.find(name);
+    if (it == ctx_.outputs.end()) {
+      return {};
+    }
+    const std::vector<Variable*>& vars = it->second;
+    std::vector<T*> res;
+    res.reserve(vars.size());
+    std::transform(vars.begin(), vars.end(), std::back_inserter(res),
+                   [&](Variable* var) -> T* {
+                     return var == nullptr ? nullptr : var->GetMutable<T>();
+                   });
+    return res;
+  }
+  template <typename T>
+  const std::vector<const T*> LegacyMultiInput(const std::string& name) const {
    auto names = op_.Inputs(name);
    std::vector<const T*> res;
    res.reserve(names.size());
@@ -262,7 +317,7 @@ class ExecutionContext {
  }
  template <typename T>
-  std::vector<T*> MultiOutput(const std::string& name) const {
+  std::vector<T*> LegacyMultiOutput(const std::string& name) const {
    auto names = op_.Outputs(name);
    std::vector<T*> res;
    res.reserve(names.size());
@@ -321,6 +376,10 @@ template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
    const std::string& name) const;
+template <>
+const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
+    const std::string& name) const;
 template <>
 Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;

--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/var_type.h"
 namespace paddle {
 namespace framework {
@@ -27,6 +28,9 @@ void Tensor::check_memory_size() const {
      "or maybe the required data-type mismatches the data already stored.");
 }
+Tensor::Tensor(std::type_index type)
+    : type_(framework::ToDataType(type)), offset_(0) {}
 size_t Tensor::memory_size() const {
  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
 }
@@ -101,5 +105,12 @@ const DDim& Tensor::dims() const { return dims_; }
 int64_t Tensor::numel() const { return product(dims_); }
+void Tensor::ResetHolder(std::shared_ptr<memory::Allocation> holder) {
+  if (holder_) {
+    PADDLE_ENFORCE_EQ(numel() * SizeOfType(type()), holder->size());
+  }
+  holder_ = holder;
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -69,6 +69,8 @@ class Tensor {
 public:
  Tensor() : type_(proto::VarType::FP32), offset_(0) {}
+  explicit Tensor(std::type_index type);
  /*! Return a pointer to mutable memory block. */
  template <typename T>
  T* data();
@@ -162,6 +164,8 @@ class Tensor {
    return std::move(holder_);
  }
+  void ResetHolder(std::shared_ptr<memory::Allocation> holder);
 private:
  /*! holds the memory block if allocated. */
  std::shared_ptr<memory::Allocation> holder_;

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -231,11 +231,14 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                  inputs[i].data.length());
    } else {
 #ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx =
+          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(),
+                   inputs[i].data.length(), dev_ctx->stream());
-                   0);  // stream 0 for sync copy
 #else
      PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -208,11 +208,14 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                  inputs[i].data.length());
    } else {
 #ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx =
+          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(),
+                   inputs[i].data.length(), dev_ctx->stream());
-                   0);  // stream 0 for sync copy
 #else
      PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif

--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -89,12 +89,21 @@ endif()
 if(WITH_MKL)
  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
+  if(NOT WIN32)
    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
                 ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
+  else(WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX}
+            ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX})
+  endif(WIN32)
  set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
  if(EXISTS ${MKLDNN_PATH})
    include_directories("${MKLDNN_PATH}/include")
+    if(WIN32)
+      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
+    else(WIN32)
      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+    endif(WIN32)
  endif()
 else()
  set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})

--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
 namespace paddle {
 namespace operators {
@@ -123,6 +124,8 @@ class GemmConvKernel : public framework::OpKernel<T> {
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+    auto& dev_ctx = context.template device_context<DeviceContext>();
    const int batch_size = static_cast<int>(input->dims()[0]);
    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
@@ -155,13 +158,19 @@ class GemmConvKernel : public framework::OpKernel<T> {
    // to call the matrix multiplication interface.
    Tensor col_matrix;
    if (is_expand) {
-      col.mutable_data<T>(col_shape, context.GetPlace());
+      auto tmp_allocation_ptr =
+          platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
+              framework::product(col_shape) * sizeof(T));
+      Tensor tep_tensor =
+          platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
+      col.ShareDataWith(tep_tensor);
      col_matrix.ShareDataWith(col);
      col_matrix.Resize(col_matrix_shape);
    }
-    framework::DDim input_shape = framework::slice_ddim(
+    framework::DDim input_shape =
-        input->dims(), 1, static_cast<int>(input->dims().size()));
+        framework::slice_ddim(input->dims(), 1, input->dims().size());
    framework::DDim filter_matrix_shape = {filter.dims()[0],
                                           filter.numel() / filter.dims()[0]};
@@ -178,7 +187,6 @@ class GemmConvKernel : public framework::OpKernel<T> {
    math::Vol2ColFunctor<DeviceContext, T> vol2col;
    math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
    for (int i = 0; i < batch_size; i++) {
      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
@@ -237,6 +245,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
    const int batch_size = static_cast<int>(input->dims()[0]);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
    // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
@@ -262,8 +272,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
    framework::DDim col_matrix_shape =
        framework::flatten_to_2d(col_shape, data_dim + 1);
-    framework::DDim input_shape = framework::slice_ddim(
+    framework::DDim input_shape =
-        input->dims(), 1, static_cast<int>(input->dims().size()));
+        framework::slice_ddim(input->dims(), 1, input->dims().size());
    framework::DDim filter_matrix_shape = {filter.dims()[0],
                                           filter.numel() / filter.dims()[0]};
@@ -286,13 +296,18 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
    // to call the matrix multiplication interface.
    Tensor col_matrix;
    if (is_expand) {
-      col.mutable_data<T>(col_shape, context.GetPlace());
+      auto tmp_allocation_ptr =
+          platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
+              framework::product(col_shape) * sizeof(T));
+      Tensor tep_tensor =
+          platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
+      col.ShareDataWith(tep_tensor);
      col_matrix.ShareDataWith(col);
      col_matrix.Resize(col_matrix_shape);
    }
    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
    if (input_grad) {

--- a/paddle/fluid/operators/detection/density_prior_box_op.cu
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cu
@@ -142,12 +142,13 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
    vars->mutable_data<T>(ctx.GetPlace());
    framework::Tensor d_temp;
-    framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);
+    framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp);
    // At least use 32 threads, at most 512 threads.
    // blockx is multiple of 32.
    int blockx = std::min(
-        static_cast<long>(((feature_width * num_priors + 31) >> 5) << 5), 512L);
+        static_cast<int64_t>(((feature_width * num_priors + 31) >> 5) << 5),
+        512L);
    int gridx = (feature_width * num_priors + blockx - 1) / blockx;
    dim3 threads(blockx, 1);
    dim3 grids(gridx, feature_height);

--- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
@@ -16,11 +16,14 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/jit/kernels.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/operators/jit/kernels.h"
+#ifdef PADDLE_WITH_XBYAK
 #include "xbyak/xbyak.h"
 #include "xbyak/xbyak_util.h"
+#endif
 namespace paddle {
 namespace operators {
@@ -81,8 +84,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
    UpdateDataFormat(ctx, const_cast<Tensor*>(x), "x_data_format");
    UpdateDataFormat(ctx, const_cast<Tensor*>(y), "y_data_format");
-    Xbyak::util::Cpu cpu;
+    const bool is_avx512_enabled = platform::MayIUse(platform::avx512f);
-    const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F);
    const bool are_dims_divisable = !(x_int_dims[1] % 16);
    const bool is_x_format_correct = x->format() == memory::format::nChw16c;
    const bool is_y_format_correct = y->format() == memory::format::nc;

--- a/paddle/fluid/operators/jit/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/CMakeLists.txt
@@ -21,5 +21,5 @@ endif()
 cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS})
 cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper)
 if(NOT WIN32)
-    cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper)
+    cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer)
 endif()
--- a/paddle/fluid/operators/jit/README.en.md
+++ b/paddle/fluid/operators/jit/README.en.md
+# JIT Kernel
+JIT(Just In Time) Kernel contains actually generated code and some other implemenations with the same logic.
+Each implementations has its own condition to use, defined in `UseMe`.
+They are combined together to get the best performance of one single independent function.
+They could be some very simple functions like vector multiply, or some complicated functions like LSTM.
+And they can be composed with some other exited jit kernels to build up a complex function. 
+Currently it's only supported on CPU yet.
+## Contents
+```txt
+PaddlePaddle/Paddle/paddle/fluid/
+├── ...
+└── operators/
+    ├── .../
+    └── jit/
+        ├── ...
+        ├── gen/
+        │   └── ...
+        |── more/
+        │   ├── ...
+        │   ├── mkl/
+        │   │   └── ...
+        │   ├── mkldnn/
+        │   │   └── ...
+        │   ├── mix/
+        │   │   └── ...
+        │   ├── intrinsic/
+        │   │   └── ...
+        │   └── openblas/
+        │       └── ...
+        └── refer/
+            └── ...
+```
+All basical definations of jit kernels are addressed in `paddle/fluid/operators/jit` including these three key folders `refer`, `gen`, `more`. There is only one unique name for each kernel while may have seraval implementations with same functionality.
+- `refer`: Each kernel must have one reference implementation on CPU, and it should only focus on the correctness and should not depends on any third-party libraries.
+- `gen`: The code generated should be kept here. They should be designed focusing on the best performance, which depends on Xbyak.
+- `more`: All other implementations should be kept in this folder with one directory corresponding to one library kind or method kind, such as mkl, mkldnn, openblas or intrinsic code. Each implementation should have it advantage. 
+## How to use
+One simple function `jit::Get`, which is very easy to use, is supported to get the kernel.
+It can automatically return the expected function with best performance under the given attributes. 
+All kernels are inlcuded in `paddle/fluid/operators/jit/kernels.h`, you can only include this one header to get all the registered kernels.
+## Solid Test
+- Unit Test
+    All functions should be compared with the corresponding reference functions, including data tyep `float` and `double`.
+- Benchmark
+    All functions should be tested, and make sure the `jit::Get` function obtain the best performance with all attributes.
+# How to add new kernel
+## Required
+1. Add `your_key` at `KernelType`.
+2. Add reference function of `your_key`. 
+Note:
+    - this should be run on CPU and do not depend on any third-party.
+    - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used.
+3. Add unit test in `test.cc`, and verfiy at least `float` and `double`.
+Test more data type for some special functions if necessary, for example `int8`.
+4. Add functions in `benchmark.cc` to test all function of same `KernelType`. Make sure `jit::Get` always get the best one.
+## Optional
+Add more implementations of `your_kery` for performance enhancement.
+1. Add functions based on generated code in `gen`. It should be derived from `JitCode` and should have corepsonding creator from `JitCodeCreator` which will be registered on the `your_key`.
+Note: Add new `KernelTuples` if necessary，your can refer to `XYZNTuples`.
+Specialie method `JitCodeKey` when add new attribute type。
+2. Add more functions in `more`，you can use any third party you wish, like mkl, mkldnn or intrinsic code to reach the best performance.
--- a/paddle/fluid/operators/jit/README.md
+++ b/paddle/fluid/operators/jit/README.md
@@ -10,9 +10,9 @@
 ```txt
 PaddlePaddle/Paddle/paddle/fluid/
 ├── ...
-├── operator/
+└── operators/
-│   ├── .../
+    ├── .../
-└── jit/
+    └── jit/
        ├── ...
        ├── gen/
        │   └── ...

--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -36,6 +36,8 @@ class GenBase : public Kernel {
    if (FLAGS_dump_jitcode) {
      this->dumpCode(code);
    }
+    // Note: failed to cast with reinterpret_cast<const Func> on Mac clang,
+    // then workaround with const_cast. Any better idea is appreciated.
    return reinterpret_cast<Func>(const_cast<unsigned char*>(code));
  }

--- a/paddle/fluid/operators/math/concat_and_split.cu
+++ b/paddle/fluid/operators/math/concat_and_split.cu
@@ -131,9 +131,8 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
    int in_col = input[0].numel() / in_row;
    int out_row = in_row, out_col = 0;
-    framework::Vector<int16_t> inputs_data(in_num * sizeof(T*) / 2);
+    std::vector<T*> inputs_data(in_num);
-    framework::Vector<int> inputs_col(in_num + 1);
+    std::vector<int> inputs_col(in_num + 1);
-    T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
    inputs_col[0] = 0;
    bool sameShape = true;
@@ -144,12 +143,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
      }
      out_col += t_cols;
      inputs_col[i + 1] = out_col;
-      inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
+      inputs_data[i] = const_cast<T*>(input[i].data<T>());
    }
-    T** dev_ins_data =
-        reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
    // computation
    // set the thread block and grid according to CurrentDeviceId
    const int kThreadsPerBlock = 1024;
@@ -169,18 +165,32 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
        std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
    dim3 grid_size = dim3(grid_cols, grid_rows, 1);
+    auto tmp_dev_ins_data =
+        platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate(
+            inputs_data.size() * sizeof(T*));
+    memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
+                 tmp_dev_ins_data->ptr(), platform::CPUPlace(),
+                 static_cast<void*>(inputs_data.data()),
+                 inputs_data.size() * sizeof(T*), context.stream());
+    T** dev_ins_data = reinterpret_cast<T**>(tmp_dev_ins_data->ptr());
    if (sameShape) {
      ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
          dev_ins_data, in_col, out_row, out_col, output->data<T>());
    } else {
-      const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
+      auto tmp_dev_ins_col_data =
+          platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate(
+              inputs_col.size() * sizeof(int));
+      memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
+                   tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
+                   static_cast<void*>(inputs_col.data()),
+                   inputs_col.size() * sizeof(int), context.stream());
+      int* dev_ins_col_data = static_cast<int*>(tmp_dev_ins_col_data->ptr());
      ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
          dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
          out_row, out_col, output->data<T>());
    }
-    // Wait() must be called because `inputs_data` may be destructed before
-    // kernel ends
-    context.Wait();
  }
 };
@@ -207,9 +217,8 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
    int in_col = 0, in_row = out_row;
    bool sameShape = true;
-    framework::Vector<int16_t> outputs_data(o_num * sizeof(T*) / 2);
+    std::vector<T*> outputs_data(o_num);
-    framework::Vector<int> outputs_cols(o_num + 1);
+    std::vector<int> outputs_cols(o_num + 1);
-    T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
    outputs_cols[0] = 0;
    for (int i = 0; i < o_num; ++i) {
@@ -220,15 +229,12 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
      in_col += t_col;
      outputs_cols[i + 1] = in_col;
      if (outputs->at(i) != nullptr) {
-        outputs_ptr[i] = outputs->at(i)->data<T>();
+        outputs_data[i] = outputs->at(i)->data<T>();
      } else {
-        outputs_ptr[i] = nullptr;
+        outputs_data[i] = nullptr;
      }
    }
-    T** dev_out_gpu_data =
-        reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
    // computation
    const int kThreadsPerBlock = 1024;
    int block_cols = kThreadsPerBlock;
@@ -247,18 +253,33 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
        std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
    dim3 grid_size = dim3(grid_cols, grid_rows, 1);
+    auto tmp_dev_outs_data =
+        platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate(
+            outputs_data.size() * sizeof(T*));
+    memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
+                 tmp_dev_outs_data->ptr(), platform::CPUPlace(),
+                 reinterpret_cast<void*>(outputs_data.data()),
+                 outputs_data.size() * sizeof(T*), context.stream());
+    T** dev_out_gpu_data = reinterpret_cast<T**>(tmp_dev_outs_data->ptr());
    if (sameShape) {
      SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
    } else {
-      const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
+      auto tmp_dev_ins_col_data =
+          platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate(
+              outputs_cols.size() * sizeof(int));
+      memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
+                   tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
+                   reinterpret_cast<void*>(outputs_cols.data()),
+                   outputs_cols.size() * sizeof(int), context.stream());
+      int* dev_outs_col_data =
+          reinterpret_cast<int*>(tmp_dev_ins_col_data->ptr());
      SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), in_row, in_col, dev_outs_col_data,
          static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
    }
-    // Wait() must be called because `outputs_data` may be destructed before
-    // kernel ends
-    context.Wait();
  }
 };

--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
@@ -17,6 +17,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
+#if defined(_WIN32)
+#if defined(__AVX2__) || defined(__AVX__)
+inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); }
+#endif
+#endif
 namespace paddle {
 namespace operators {
 namespace math {

--- a/paddle/fluid/operators/mean_iou_op.cu
+++ b/paddle/fluid/operators/mean_iou_op.cu
@@ -92,8 +92,8 @@ template <typename T>
 class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-                       .eigen_device();
+    auto& place = *dev_ctx.eigen_device();
    // get input and output tensor
    auto* predictions = ctx.Input<Tensor>("Predictions");
    auto* labels = ctx.Input<Tensor>("Labels");
@@ -115,11 +115,11 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
-    // Temporary tensor
+    // Temporary memory
-    Tensor ious;
+    auto& allocator =
-    float* ious_data = ious.mutable_data<float>(
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
-        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
+    auto tmp_ious_data = allocator.Allocate(num_classes * sizeof(float));
-    auto ious_t = EigenTensor<float, 1>::From(ious);
+    float* ious_data = static_cast<float*>(tmp_ious_data->ptr());
    // Init out_wrong, out_correct and out_mean_iou
    out_wrong_t.device(place) = out_wrong_t.constant(0);
@@ -148,7 +148,7 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
    CountCUDAKernel<T><<<grid, block, cache_size, stream>>>(
        num_classes, predictions->numel(), predictions_data, labels_data,
        out_wrong_data, out_correct_data);
-    ctx.device_context().Wait();
    ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data,
                                                  out_correct_data, ious_data,
                                                  out_mean_iou_data);

--- a/paddle/fluid/operators/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/transpose_mkldnn_op.cc
@@ -29,10 +29,6 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");
-    const bool is_test = ctx.Attr<bool>("is_test");
-    PADDLE_ENFORCE(
-        is_test == true,
-        "TransposeMKLDNN works only for inference!. Set is_test = True");
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -68,6 +64,57 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  }
 };
+template <typename T>
+class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    if (!x_grad) return;
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    std::vector<int> reversed_axis(axis);
+    int ndims = axis.size();
+    if (ndims == 1) {
+      x_grad->ShareDataWith(*out_grad);
+      return;
+    }
+    for (size_t i = 0; i < axis.size(); i++) {
+      reversed_axis[axis[i]] = i;
+    }
+    const T* out_grad_data = out_grad->data<T>();
+    x_grad->mutable_data<T>(ctx.GetPlace());
+    std::vector<int> nchw_tz =
+        paddle::framework::vectorize2int(out_grad->dims());
+    const std::string key = platform::TransposeMKLDNNHandler::GetHash(
+        nchw_tz, axis, ctx.op().Output(framework::GradVarName("X")));
+    platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx,
+                                             mkldnn_engine, key);
+    auto transpose_src_memory_p = handler.AcquireSrcMemory(
+        out_grad->format(), platform::to_void_cast<T>(out_grad_data));
+    auto transpose_dst_memory_p =
+        handler.AcquireDstMemory(x_grad, ctx.GetPlace());
+    auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
+                                                transpose_src_memory_p);
+    std::vector<mkldnn::primitive> pipeline;
+    pipeline.push_back(*transpose_p);
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  }
+};
 }  // namespace operators
 }  // namespace paddle
@@ -77,3 +124,8 @@ REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::TransposeMKLDNNOpKernel<float>);
 REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::TransposeMKLDNNOpKernel<float>);
+REGISTER_OP_KERNEL(transpose_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNGradOpKernel<float>);
+REGISTER_OP_KERNEL(transpose2_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNGradOpKernel<float>);
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -79,10 +79,6 @@ class TransposeOp : public framework::OperatorWithKernel {
 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
    AddInput(
        "X",
        "(Tensor) The input tensor, tensors with rank up to 6 are supported.");
@@ -147,6 +143,24 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
    }
  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    return framework::OpKernelType(
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
+        ctx.GetPlace(), layout_, library_);
+  }
 };
 // FIXME(zcd): transpose2 adds an intermediate output(XShape) based on
@@ -237,9 +251,19 @@ class Transpose2OpGrad : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
    return framework::OpKernelType(
        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
+        ctx.GetPlace(), layout_, library_);
  }
 };

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -56,6 +56,8 @@ ELSE()
    set(MKLDNN_CTX_DEPS)
 ENDIF()
+cc_library(temp_allocator SRCS temporary_allocator.cc DEPS  allocator_facade)
 nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) 
 IF(WITH_GPU)
  set(STREAM_CALLBACK_DEPS stream_callback_manager)
@@ -66,7 +68,8 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
-    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}  temp_allocator)
 if(WIN32)
    if(WITH_GPU AND NOT WITH_DSO)
        get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
@@ -92,3 +95,9 @@ IF(WITH_GPU)
  nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
 ENDIF()
 nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
+if(WITH_GPU)
+    nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor)
+else()
+    cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor)
+endif()
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -22,7 +22,6 @@ limitations under the License. */
 #ifdef __APPLE__
 #include <sys/sysctl.h>
 #include <sys/types.h>
 #elif defined(_WIN32)
 #define NOMINMAX  // msvc max/min macro conflict with std::min/max
 #include <windows.h>

--- a/paddle/fluid/platform/create_tensor_with_allocationptr.h
+++ b/paddle/fluid/platform/create_tensor_with_allocationptr.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/temporary_allocator.h"
+namespace paddle {
+namespace platform {
+template <typename T>
+paddle::framework::Tensor GetTensor(
+    memory::allocation::AllocationPtr temp_allocation_ptr,
+    const framework::DDim &dim) {
+  auto &deleter = temp_allocation_ptr.get_deleter();
+  auto *allocation_ptr = temp_allocation_ptr.release();
+  auto shared_allocation =
+      std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
+  PADDLE_ENFORCE(dynamic_cast<TemporaryAllocation *>(allocation_ptr) != nullptr,
+                 "The AllocationPtr must be TemporaryAllocation.");
+  PADDLE_ENFORCE_EQ(allocation_ptr->size(),
+                    framework::product(dim) * sizeof(T));
+  paddle::framework::Tensor temp_tensor(std::type_index(typeid(T)));
+  temp_tensor.Resize(dim);
+  temp_tensor.ResetHolder(std::move(shared_allocation));
+  return temp_tensor;
+}
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -85,6 +85,49 @@ DeviceContextPool::DeviceContextPool(
  }
 }
+DeviceTemporaryAllocator* DeviceTemporaryAllocator::allocators = nullptr;
+#ifdef PADDLE_WITH_CUDA
+platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
+    const platform::Place& place, const cudaStream_t& stream) {
+  PADDLE_ENFORCE(platform::is_gpu_place(place));
+  auto place_stream = std::make_pair(place, stream);
+  {
+    std::unique_lock<std::mutex> lock(mtx_);
+    if (!device_allocator_.count(place_stream)) {
+      device_allocator_[place_stream].reset(new TemporaryAllocator(place));
+      device_allocator_[place_stream]->SetCallback([stream]() {
+        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+        PADDLE_ENFORCE(cudaGetLastError());
+      });
+    }
+  }
+  return *device_allocator_.at(place_stream);
+}
+template <>
+platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
+    const platform::CUDADeviceContext& dev_ctx) {
+  auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream());
+  if (device_allocator_.count(place_stream)) {
+    return *device_allocator_.at(place_stream);
+  }
+  return Get(dev_ctx.GetPlace(), dev_ctx.stream());
+}
+#endif
+template <>
+platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
+    const platform::CPUDeviceContext& dev_ctx) {
+  return cpu_allocator_;
+}
+platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
+    const platform::Place& place) {
+  PADDLE_ENFORCE(platform::is_cpu_place(place), "You should pass CPUPlace");
+  return cpu_allocator_;
+}
 CPUDeviceContext::CPUDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
 }
@@ -271,8 +314,12 @@ CUDADeviceContext::~CUDADeviceContext() {
 Place CUDADeviceContext::GetPlace() const { return place_; }
 void CUDADeviceContext::Wait() const {
+  auto& allocator =
+      DeviceTemporaryAllocator::Instance().Get<CUDADeviceContext>(*this);
+  allocator.Release([=]() {
    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
    PADDLE_ENFORCE(cudaGetLastError());
+  });
 }
 int CUDADeviceContext::GetComputeCapability() const {

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -15,8 +15,10 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/temporary_allocator.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
@@ -39,6 +41,50 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
+/*! \brief device temporary allocator singleton */
+class DeviceTemporaryAllocator {
+ public:
+  static DeviceTemporaryAllocator& Instance() {
+    PADDLE_ENFORCE_NOT_NULL(allocators,
+                            "Need to Create DeviceTemporaryAllocator first!");
+    return *allocators;
+  }
+  static DeviceTemporaryAllocator& Init() {
+    if (allocators == nullptr) {
+      allocators = new DeviceTemporaryAllocator();
+    }
+    return *allocators;
+  }
+/*! \brief  Return handle of single temporary allocator. */
+#ifdef PADDLE_WITH_CUDA
+  platform::TemporaryAllocator& Get(const platform::Place& place,
+                                    const cudaStream_t& stream);
+#endif
+  template <typename DeviceContext>
+  platform::TemporaryAllocator& Get(const DeviceContext& dev_ctx);
+  platform::TemporaryAllocator& Get(const platform::Place& place);
+ private:
+  DeviceTemporaryAllocator() : cpu_allocator_(platform::CPUPlace()) {}
+  static DeviceTemporaryAllocator* allocators;
+  platform::TemporaryAllocator cpu_allocator_;
+#ifdef PADDLE_WITH_CUDA
+  std::map<std::pair<platform::Place, cudaStream_t>,
+           std::unique_ptr<platform::TemporaryAllocator>>
+      device_allocator_;
+#endif
+  std::mutex mtx_;
+  DISABLE_COPY_AND_ASSIGN(DeviceTemporaryAllocator);
+};
 class DeviceContext {
 public:
  virtual ~DeviceContext() {}

--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -227,6 +227,8 @@ void* GetTensorRtDsoHandle() {
 void* GetMKLMLDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
 #else
  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
 #endif

--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -110,7 +110,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
  }
  places.emplace_back(platform::CPUPlace());
  platform::DeviceContextPool::Init(places);
+  platform::DeviceTemporaryAllocator::Init();
 #ifndef PADDLE_WITH_MKLDNN
  platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif

--- a/paddle/fluid/platform/temporary_allocator.cc
+++ b/paddle/fluid/platform/temporary_allocator.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/platform/temporary_allocator.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+DEFINE_double(limit_of_temporary_allocation, -1,
+              "The up limit of temporary_allocation size.");
+namespace paddle {
+namespace platform {
+namespace alloc = memory::allocation;
+TemporaryAllocation::TemporaryAllocation(
+    alloc::AllocationPtr &&underlying_allocation)
+    : Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
+                 underlying_allocation->place()),
+      underlying_allocation_(std::move(underlying_allocation)) {}
+TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) {
+  temp_mem_queue_.reset(new std::deque<TemporaryAllocation *>());
+}
+bool TemporaryAllocator::IsAllocThreadSafe() const { return true; }
+void TemporaryAllocator::Release(const std::function<void()> &callback) {
+  std::shared_ptr<std::deque<TemporaryAllocation *>> t_allocations;
+  {
+    std::unique_lock<std::mutex> lock(mtx_);
+    callback();
+    t_allocations = temp_mem_queue_;
+    temp_mem_queue_.reset(new std::deque<TemporaryAllocation *>());
+    wait_delete_mem_ = 0;
+  }
+  for (auto tmp : *t_allocations) {
+    VLOG(10) << "Delete temporary allocation " << tmp->ptr()
+             << " size: " << tmp->size();
+    delete tmp;
+  }
+}
+void TemporaryAllocator::Free(alloc::Allocation *allocation) {
+  auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation);
+  PADDLE_ENFORCE_NOT_NULL(temp_allocation);
+  if (platform::is_gpu_place(temp_allocation->place())) {
+    size_t wait_delete_mem = 0;
+    {
+      std::unique_lock<std::mutex> lock(mtx_);
+      temp_mem_queue_->emplace_back(temp_allocation);
+      wait_delete_mem_ += temp_allocation->size();
+      wait_delete_mem = wait_delete_mem_;
+      VLOG(10) << "Move temporary allocation: " << temp_allocation->ptr()
+               << " to delete queue: " << temp_allocation->size() << "; "
+               << "wait_delete_mem: " << wait_delete_mem_;
+    }
+    if (FLAGS_limit_of_temporary_allocation > 0 &&
+        wait_delete_mem > FLAGS_limit_of_temporary_allocation) {
+      Release(callback_);
+    }
+    return;
+  }
+  delete temp_allocation;
+}
+size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
+  std::unique_lock<std::mutex> lock(mtx_);
+  return temp_mem_queue_ ? temp_mem_queue_->size() : 0;
+}
+void TemporaryAllocator::SetCallback(const std::function<void()> &callback) {
+  callback_ = callback;
+}
+alloc::Allocation *TemporaryAllocator::AllocateImpl(
+    size_t size, alloc::Allocator::Attr attr) {
+  auto raw_allocation =
+      alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
+  auto temp_mem = new TemporaryAllocation(std::move(raw_allocation));
+  VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size;
+  return temp_mem;
+}
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/temporary_allocator.h
+++ b/paddle/fluid/platform/temporary_allocator.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <condition_variable>  // NOLINT
+#include <deque>
+#include <mutex>  // NOLINT
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/lock_guard_ptr.h"
+namespace paddle {
+namespace platform {
+class TemporaryAllocation : public memory::allocation::Allocation {
+ public:
+  explicit TemporaryAllocation(
+      memory::allocation::AllocationPtr &&underlying_allocation);
+  memory::allocation::AllocationPtr underlying_allocation_;
+};
+class TemporaryAllocator : public memory::allocation::Allocator {
+ public:
+  explicit TemporaryAllocator(platform::Place place);
+  void Release(const std::function<void()> &callback);
+  size_t TemporaryAllocationQueueSize();
+  bool IsAllocThreadSafe() const override;
+  void SetCallback(const std::function<void()> &callback);
+ protected:
+  void Free(memory::allocation::Allocation *allocation) override;
+  memory::allocation::Allocation *AllocateImpl(
+      size_t size, memory::allocation::Allocator::Attr attr) override;
+ private:
+  platform::Place place_;
+  // When the allocation is not held by any variable, it should be placed
+  // to temp_mem_queue immediately.
+  std::shared_ptr<std::deque<TemporaryAllocation *>> temp_mem_queue_{nullptr};
+  std::mutex mtx_;
+  size_t wait_delete_mem_{0};
+  std::function<void()> callback_;
+};
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/temporary_allocator_test.cc
+++ b/paddle/fluid/platform/temporary_allocator_test.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/platform/temporary_allocator.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
+DECLARE_double(limit_of_temporary_allocation);
+namespace paddle {
+namespace platform {
+TEST(temporary_allocator, temporary_allocator) {
+  platform::CPUPlace cpu_place;
+  TemporaryAllocator alloc(cpu_place);
+  alloc.Allocate(100);
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu_place(0);
+  TemporaryAllocator gpu_alloc(gpu_place);
+  auto allocation = gpu_alloc.Allocate(101);
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+  gpu_alloc.Release([]() {});
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+  {
+    auto allocation = gpu_alloc.Allocate(102);
+    PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+  }
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
+  gpu_alloc.Release([]() {});
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+#endif
+}
+TEST(temporary_allocator, add_callback) {
+#ifdef PADDLE_WITH_CUDA
+  FLAGS_limit_of_temporary_allocation = 10;
+  platform::CUDAPlace gpu_place(0);
+  TemporaryAllocator gpu_alloc(gpu_place);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx =
+      static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
+  auto stream = dev_ctx->stream();
+  bool deleted = false;
+  gpu_alloc.SetCallback([stream, &deleted]() {
+    PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+    PADDLE_ENFORCE(cudaGetLastError());
+    deleted = true;
+  });
+  { gpu_alloc.Allocate(100); }
+  PADDLE_ENFORCE(deleted);
+  FLAGS_limit_of_temporary_allocation = -1;
+#endif
+}
+TEST(temporary_allocator, create_tensor_with_allocationptr) {
+  platform::CPUPlace cpu_place;
+  TemporaryAllocator cpu_alloc(cpu_place);
+  {
+    size_t memory_size = 200;
+    auto allocation = cpu_alloc.Allocate(memory_size);
+    void* address = allocation->ptr();
+    int numel = memory_size / sizeof(float);
+    framework::Tensor tensor =
+        GetTensor<float>(std::move(allocation), framework::make_ddim({numel}));
+    PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+    PADDLE_ENFORCE_EQ(tensor.numel(), numel);
+  }
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu_place(0);
+  TemporaryAllocator gpu_alloc(gpu_place);
+  {
+    size_t memory_size = 300;
+    auto allocation = gpu_alloc.Allocate(memory_size);
+    void* address = allocation->ptr();
+    int numel = memory_size / sizeof(float);
+    framework::Tensor tensor =
+        GetTensor<float>(std::move(allocation), framework::make_ddim({numel}));
+    PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+    PADDLE_ENFORCE_EQ(tensor.numel(), numel);
+  }
+  // The allocation is not holded now, it should be placed to
+  // TemporaryAllocationQueue.
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
+  gpu_alloc.Release([]() {});
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+#endif
+}
+TEST(temporary_allocator, create_tensor_with_allocationptr2) {
+  platform::CPUPlace cpu_place;
+  TemporaryAllocator cpu_alloc(cpu_place);
+  {
+    size_t memory_size = 400;
+    int numel = memory_size / sizeof(float);
+    framework::Tensor out_side_tensor;
+    void* address;
+    {
+      auto allocation = cpu_alloc.Allocate(memory_size);
+      address = allocation->ptr();
+      framework::Tensor tensor = GetTensor<float>(
+          std::move(allocation), framework::make_ddim({numel}));
+      PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+      PADDLE_ENFORCE_EQ(tensor.numel(), numel);
+      out_side_tensor.ShareDataWith(tensor);
+    }
+    PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>());
+    PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel);
+  }
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu_place(0);
+  TemporaryAllocator gpu_alloc(gpu_place);
+  {
+    void* address;
+    size_t memory_size = 500;
+    int numel = memory_size / sizeof(float);
+    framework::Tensor out_side_tensor;
+    {
+      auto allocation = gpu_alloc.Allocate(memory_size);
+      address = allocation->ptr();
+      framework::Tensor tensor = GetTensor<float>(
+          std::move(allocation), framework::make_ddim({numel}));
+      PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+      PADDLE_ENFORCE_EQ(tensor.numel(), numel);
+      out_side_tensor.ShareDataWith(tensor);
+    }
+    PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>());
+    PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel);
+    // The allocation is holded by out_side_tensor.
+    PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+    gpu_alloc.Release([]() {});
+    PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+  }
+  // The allocation is not holded now, it should be placed to
+  // TemporaryAllocationQueue.
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
+  gpu_alloc.Release([]() {});
+  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
+#endif
+}
+}  //  namespace platform
+}  //  namespace paddle
--- a/paddle/fluid/train/demo/CMakeLists.txt
+++ b/paddle/fluid/train/demo/CMakeLists.txt
@@ -35,16 +35,26 @@ add_executable(demo_trainer demo_trainer.cc)
 if(WITH_MKLDNN)
  include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include")
+  if(WIN32)
+    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib)
+  else(WIN32)
    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0)
-endif()
+  endif(WIN32)
+endif(WITH_MKLDNN)
 if(WITH_MKL)
  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
+  if(WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib)
+  else(WIN32)
    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so)
+  endif(WIN32)
 else()
  if(APPLE)
    set(MATH_LIB cblas)
-  else(APPLE)
+  elseif(WIN32)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib)
+  else()
    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
  endif(APPLE)
 endif()

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -48,18 +48,13 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
 IF(WIN32)
    # Python would use the .pyd by default under Windows series platform
    set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/)
-    get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY)
    set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd)
-    add_custom_command(OUTPUT ${FLUID_CORE}
-            COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
-            COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR}
-            DEPENDS paddle_pybind)
 ELSE()
    set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so)
-    add_custom_command(OUTPUT ${FLUID_CORE}
+ENDIF()
+add_custom_command(OUTPUT ${FLUID_CORE}
        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
        DEPENDS paddle_pybind)
-ENDIF()
 add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE})
 IF(WIN32)

--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -22,6 +22,8 @@ from . import op_frequence
 from .op_frequence import *
 from . import quantize
 from .quantize import *
+from . import slim
+from .slim import *
 from . import utils
 from .utils import *
@@ -30,4 +32,5 @@ __all__ += decoder.__all__
 __all__ += memory_usage_calc.__all__
 __all__ += op_frequence.__all__
 __all__ += quantize.__all__
+__all__ += slim.__all__
 __all__ += utils.__all__
--- a/python/paddle/fluid/contrib/slim/__init__.py
+++ b/python/paddle/fluid/contrib/slim/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .core import *
+from .graph import *
+from .prune import *
+__all__ = [
+    'build_compressor',
+    'CompressPass',
+    'ImitationGraph',
+    'SensitivePruneStrategy',
+    'MagnitudePruner',
+    'RatioPruner',
+]
--- a/python/paddle/fluid/contrib/slim/core/__init__.py
+++ b/python/paddle/fluid/contrib/slim/core/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import config
+from .config import *
+from . import compress_pass
+from .compress_pass import *
+from . import strategy
+from .strategy import *
+from . import pass_builder
+from .pass_builder import *
+__all__ = config.__all__ + compress_pass.__all__ + strategy.__all__ + pass_builder.__all__
--- a/python/paddle/fluid/contrib/slim/core/compress_pass.py
+++ b/python/paddle/fluid/contrib/slim/core/compress_pass.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ....core import CPUPlace
+from ..graph import get_executor
+__all__ = ['Context', 'CompressPass']
+class Context(object):
+    """
+    The context in the process of compression.
+    Args:
+        exe: The executor used to execute graph.
+        graph: The graph to be compressed.
+        scope: The scope used to execute graph.
+        program_exe: The program_exe is used to execute the program
+                     created for modifying the variables in scope.
+    """
+    def __init__(self, exe, graph, scope, program_exe=None):
+        # The total number of epoches to be trained.
+        self.epoch = 0
+        # Current epoch
+        self.epoch_id = 0
+        # Current batch
+        self.batch_id = 0
+        self.exe = exe
+        self.graph = graph
+        self.scope = scope
+        self.program_exe = program_exe
+class CompressPass(object):
+    """
+    The pass used to compress model.
+    Args:
+        place: The device used in compression.
+        data_reader: The data_reader used to run graph.
+        data_feeder: The data_feeder used to run graph.
+        scope: The scope used to run graph.
+        metrics: The metrics for evaluating model.
+        epoch: The total epoches of trainning in compression.
+        program_exe: The program_exe is used to execute the program
+                     created for modifying the variables in scope.
+    """
+    def __init__(self,
+                 place=None,
+                 data_reader=None,
+                 data_feeder=None,
+                 scope=None,
+                 metrics=None,
+                 epoch=None,
+                 program_exe=None):
+        self.strategies = []
+        self.place = CPUPlace() if place is None else place
+        self.data_reader = data_reader
+        self.data_feeder = data_feeder
+        self.scope = scope
+        self.metrics = metrics
+        self.epoch = epoch
+        self.program_exe = program_exe
+    def add_strategy(self, strategy):
+        """
+        Add a strategy to current compress pass.
+        Args:
+            strategy: The strategy to be added into current compress pass.
+        """
+        self.strategies.append(strategy)
+        self.epoch = max(strategy.end_epoch, self.epoch)
+    def apply(self, graph):
+        """
+        Compress a model.
+        Args:
+            graph: The target graph to be compressed.
+        """
+        self.executor = get_executor(graph, self.place)
+        context = Context(
+            self.executor, graph, self.scope, program_exe=self.program_exe)
+        for strategy in self.strategies:
+            strategy.on_compress_begin(context)
+        for epoch in range(self.epoch):
+            for strategy in self.strategies:
+                strategy.on_epoch_begin(context)
+            for data in self.data_reader():
+                for strategy in self.strategies:
+                    strategy.on_batch_begin(context)
+                fetches = None
+                if self.metrics:
+                    fetches = self.metrics.values()
+                feed = None
+                if self.data_feeder:
+                    feed = self.data_feeder.feed(data)
+                results = self.executor.run(graph,
+                                            fetches=fetches,
+                                            scope=self.scope,
+                                            feed=feed)
+                if results:
+                    print("results: {}".format(
+                        zip(self.metrics.keys(), results)))
+                for strategy in self.strategies:
+                    strategy.on_batch_end(context)
+                context.batch_id += 1
+            for strategy in self.strategies:
+                strategy.on_epoch_end(context)
+            context.epoch_id += 1
+        for strategy in self.strategies:
+            strategy.on_compress_end(context)
--- a/python/paddle/fluid/contrib/slim/core/config.py
+++ b/python/paddle/fluid/contrib/slim/core/config.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import funcsigs
+import yaml
+from collections import OrderedDict
+from ..prune import *
+from .compress_pass import *
+from .strategy import *
+__all__ = ['ConfigFactory']
+"""This factory is used to create instances by loading and parsing configure file with yaml format.
+"""
+class ConfigFactory(object):
+    def __init__(self, config):
+        """Init a factory from configure file."""
+        self.instances = {}
+        self.version = None
+        self._parse_config(config)
+    def get_compress_pass(self):
+        """
+        Get compress pass from factory.
+        """
+        return self.instance('compress_pass')
+    def instance(self, name):
+        """
+        Get instance from factory.
+        """
+        if name in self.instances:
+            return self.instances[name]
+        else:
+            return None
+    def _new_instance(self, name, attrs):
+        if name not in self.instances:
+            class_ = globals()[attrs['class']]
+            sig = funcsigs.signature(class_.__init__)
+            keys = [
+                param.name for param in sig.parameters.values()
+                if (param.kind == param.POSITIONAL_OR_KEYWORD)
+            ][1:]
+            keys = set(attrs.keys()).intersection(set(keys))
+            args = {}
+            for key in keys:
+                value = attrs[key]
+                if isinstance(value, str) and value in self.instances:
+                    value = self.instances[value]
+                args[key] = value
+            self.instances[name] = class_(**args)
+        return self.instances.get(name)
+    def _parse_config(self, config):
+        assert config
+        with open(config, 'r') as config_file:
+            key_values = self._ordered_load(config_file)
+            for key in key_values:
+                # parse version
+                if key == 'version' and self.version is None:
+                    self.version = int(key_values['version'])
+                    assert self.version == int(key_values['version'])
+                # parse pruners
+                if key == 'pruners' or key == 'strategies':
+                    instances = key_values[key]
+                    for name in instances:
+                        self._new_instance(name, instances[name])
+                if key == 'compress_pass':
+                    compress_pass = self._new_instance(key, key_values[key])
+                    for name in key_values[key]['strategies']:
+                        strategy = self.instance(name)
+                        compress_pass.add_strategy(strategy)
+                if key == 'include':
+                    for config_file in key_values[key]:
+                        self._parse_config(config_file.strip())
+    def _ordered_load(self,
+                      stream,
+                      Loader=yaml.Loader,
+                      object_pairs_hook=OrderedDict):
+        """
+        See: https://stackoverflow.com/questions/5121931/in-python-how-can-you-load-yaml-mappings-as-ordereddicts
+        """
+        class OrderedLoader(Loader):
+            pass
+        def construct_mapping(loader, node):
+            loader.flatten_mapping(node)
+            return object_pairs_hook(loader.construct_pairs(node))
+        OrderedLoader.add_constructor(
+            yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping)
+        return yaml.load(stream, OrderedLoader)
--- a/python/paddle/fluid/contrib/slim/core/pass_builder.py
+++ b/python/paddle/fluid/contrib/slim/core/pass_builder.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .compress_pass import CompressPass
+from .config import ConfigFactory
+__all__ = ['build_compressor']
+def build_compressor(place=None,
+                     data_reader=None,
+                     data_feeder=None,
+                     scope=None,
+                     metrics=None,
+                     epoch=None,
+                     config=None):
+    if config is not None:
+        factory = ConfigFactory(config)
+        comp_pass = factory.get_compress_pass()
+    else:
+        comp_pass = CompressPass()
+    comp_pass.place = place
+    comp_pass.data_reader = data_reader
+    comp_pass.data_feeder = data_feeder
+    comp_pass.scope = scope
+    comp_pass.metrics = metrics
+    comp_pass.epoch = epoch
+    return comp_pass
--- a/python/paddle/fluid/contrib/slim/core/strategy.py
+++ b/python/paddle/fluid/contrib/slim/core/strategy.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+__all__ = ['Strategy']
+class Strategy(object):
+    """
+    Base class for all strategies.
+    """
+    def __init__(self, start_epoch=0, end_epoch=10):
+        """
+        Args:
+            start_epoch: The first epoch to apply the strategy.
+            end_epoch: The last epoch to apply the strategy.
+        """
+        self.start_epoch = start_epoch
+        self.end_epoch = end_epoch
+    def on_compress_begin(self, context):
+        pass
+    def on_epoch_begin(self, context):
+        pass
+    def on_epoch_end(self, context):
+        pass
+    def on_batch_begin(self, context):
+        pass
+    def on_batch_end(self, context):
+        pass
+    def on_compress_end(self, context):
+        pass
--- a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
+++ b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
+version: 1.0
+pruners:
+    pruner_1:
+        class: 'RatioPruner'
+        ratios:
+            'conv1_1.w': 0.3
+            'conv1_2.w': 0.4
+            '*': 0.9
+        group_dims:
+            '*': [1, 2, 3]
+        criterions:
+            '*': 'l1-norm'
+strategies:
+    strategy_1:
+        class: 'SensitivePruneStrategy'
+        pruner: 'pruner_1'
+        start_epoch: 0
+        end_epoch: 10
+        delta_rate: 0.20
+        acc_loss_threshold: 0.2
+        sensitivities:
+            'conv1_1.w': 0.4
+compress_pass:
+    class: 'CompressPass'
+    epoch: 100
+    strategies:
+        - strategy_1
--- a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
+++ b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.fluid as fluid
+import paddle
+import os
+import sys
+from paddle.fluid.contrib.slim import CompressPass
+from paddle.fluid.contrib.slim import build_compressor
+from paddle.fluid.contrib.slim import ImitationGraph
+class LinearModel(object):
+    def __init__(slef):
+        pass
+    def train(self):
+        train_program = fluid.Program()
+        startup_program = fluid.Program()
+        startup_program.random_seed = 10
+        with fluid.program_guard(train_program, startup_program):
+            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+            predict = fluid.layers.fc(input=x, size=1, act=None)
+            cost = fluid.layers.square_error_cost(input=predict, label=y)
+            avg_cost = fluid.layers.mean(cost)
+            eval_program = train_program.clone()
+            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+            sgd_optimizer.minimize(avg_cost)
+        train_reader = paddle.batch(
+            paddle.dataset.uci_housing.train(), batch_size=1)
+        eval_reader = paddle.batch(
+            paddle.dataset.uci_housing.test(), batch_size=1)
+        place = fluid.CPUPlace()
+        train_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+        eval_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+        exe = fluid.Executor(place)
+        exe.run(startup_program)
+        train_metrics = {"loss": avg_cost.name}
+        eval_metrics = {"loss": avg_cost.name}
+        graph = ImitationGraph(train_program)
+        config = './config.yaml'
+        comp_pass = build_compressor(
+            place,
+            data_reader=train_reader,
+            data_feeder=train_feeder,
+            scope=fluid.global_scope(),
+            metrics=train_metrics,
+            epoch=1,
+            config=config)
+        comp_pass.apply(graph)
+if __name__ == "__main__":
+    model = LinearModel()
+    model.train()
--- a/python/paddle/fluid/contrib/slim/graph/__init__.py
+++ b/python/paddle/fluid/contrib/slim/graph/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import executor
+from .executor import *
+from . import graph
+from .graph import *
+from . import graph_pass
+from .graph_pass import *
+__all__ = executor.__all__
+__all__ += graph.__all__
+__all__ += graph_pass.__all__
--- a/python/paddle/fluid/contrib/slim/graph/executor.py
+++ b/python/paddle/fluid/contrib/slim/graph/executor.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import abc
+from abc import abstractmethod
+from .... import executor
+from .graph import IRGraph, ImitationGraph
+__all__ = ['get_executor']
+class GraphExecutor(object):
+    __metaclass__ = abc.ABCMeta
+    def __init__(self, place):
+        self.place = place
+    @abstractmethod
+    def run(self, graph, feches=None, feed=None):
+        pass
+class IRGraphExecutor(GraphExecutor):
+    def run(self, grah, fetches, feed=None):
+        pass
+class ImitationGraphExecutor(GraphExecutor):
+    def __init__(self, place):
+        super(ImitationGraphExecutor, self).__init__(place)
+        self.exe = executor.Executor(place)
+    def run(self, graph, scope=None, fetches=None, feed=None):
+        assert isinstance(graph, ImitationGraph)
+        fetch_list = None
+        if fetches:
+            fetch_list = [
+                graph.program.global_block().var(name) for name in fetches
+            ]
+        results = self.exe.run(graph.program,
+                               scope=scope,
+                               fetch_list=fetch_list,
+                               feed=feed)
+        return results
+def get_executor(graph, place):
+    if isinstance(graph, ImitationGraph):
+        return ImitationGraphExecutor(place)
+    if isinstance(graph, IRGraph):
+        return IRGraphExecutor(place)
--- a/python/paddle/fluid/contrib/slim/graph/graph.py
+++ b/python/paddle/fluid/contrib/slim/graph/graph.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ....framework import Program
+__all__ = ['Graph', 'ImitationGraph', 'IRGraph']
+class Graph(object):
+    """
+    Base class for all graph.
+    """
+    def __init__(self):
+        pass
+    def all_parameters(self):
+        """
+        Return all the parameters in current graph.
+        """
+        pass
+class ImitationGraph(Graph):
+    def __init__(self, program=None):
+        super(ImitationGraph, self).__init__()
+        self.program = Program() if program is None else program
+    def all_parameters(self):
+        return self.program.global_block().all_parameters()
+class IRGraph(Graph):
+    pass
--- a/python/paddle/fluid/contrib/slim/graph/graph_pass.py
+++ b/python/paddle/fluid/contrib/slim/graph/graph_pass.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+__all__ = ['GraphPass', 'PruneParameterPass']
+class GraphPass(object):
+    """
+    Base class for all graph pass.
+    """
+    def __init__(self):
+        pass
+    def apply(self, graph):
+        pass
+class PruneParameterPass(GraphPass):
+    """
+    Generate a graph for pruning parameters from target graph.
+    """
+    def __init__(self, pruned_params, thresholds):
+        super(PruneParameterPass, self).__init__()
+        self.pruned_params = pruned_params
+        self.thresholds = thresholds
+        self.default_threshold = thresholds['*']
+    def apply(self, graph):
+        pass
--- a/python/paddle/fluid/contrib/slim/prune/__init__.py
+++ b/python/paddle/fluid/contrib/slim/prune/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import pruner
+from .pruner import *
+from . import prune_strategy
+from .prune_strategy import *
+__all__ = pruner.__all__
+__all__ += prune_strategy.__all__
--- a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
+++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..core.strategy import Strategy
+from ....framework import Program, program_guard
+from .... import layers
+import numpy as np
+__all__ = ['SensitivePruneStrategy', 'PruneStrategy']
+class SensitivePruneStrategy(Strategy):
+    def __init__(self,
+                 pruner=None,
+                 start_epoch=0,
+                 end_epoch=10,
+                 delta_rate=0.20,
+                 acc_loss_threshold=0.2,
+                 sensitivities=None):
+        super(SensitivePruneStrategy, self).__init__(start_epoch, end_epoch)
+        self.pruner = pruner
+        self.delta_rate = delta_rate
+        self.acc_loss_threshold = acc_loss_threshold
+        self.sensitivities = sensitivities
+class PruneStrategy(Strategy):
+    """
+    The strategy that pruning weights by threshold or ratio iteratively.
+    """
+    def __init__(self,
+                 pruner,
+                 mini_batch_pruning_frequency=1,
+                 start_epoch=0,
+                 end_epoch=10):
+        super(PruneStrategy, self).__init__(start_epoch, end_epoch)
+        self.pruner = pruner
+        self.mini_batch_pruning_frequency = mini_batch_pruning_frequency
+    def _triger(self, context):
+        return (context.batch_id % self.mini_batch_pruning_frequency == 0 and
+                self.start_epoch <= context.epoch_id < self.end_epoch)
+    def on_batch_end(self, context):
+        if self._triger(context):
+            prune_program = Program()
+            with program_guard(prune_program):
+                for param in context.graph.all_parameters():
+                    prune_program.global_block().clone_variable(param)
+                    p = prune_program.global_block().var(param.name)
+                    zeros_mask = self.pruner.prune(p)
+                    pruned_param = p * zeros_mask
+                    layers.assign(input=pruned_param, output=param)
+            context.program_exe.run(prune_program, scope=context.scope)
--- a/python/paddle/fluid/contrib/slim/prune/pruner.py
+++ b/python/paddle/fluid/contrib/slim/prune/pruner.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+from .... import layers
+__all__ = ['Pruner', 'MagnitudePruner', 'RatioPruner']
+class Pruner(object):
+    """
+    Base class of all pruners.
+    """
+    def __init__(self):
+        pass
+    def prune(self, param):
+        pass
+class MagnitudePruner(Pruner):
+    """
+    Pruner used to pruning a parameter by threshold.
+    """
+    def __init__(self, threshold):
+        self.threshold = threshold
+    def prune(self, param, threshold=None):
+        if threshold is None:
+            thres = layers.fill_constant(
+                shape=[1], dtype='float32', value=self.threshold)
+        else:
+            thres = threshold
+        zeros_mask = layers.less_than(x=param, y=thres)
+        return zeros_mask
+class RatioPruner(Pruner):
+    """
+    Pruner used to pruning a parameter by ratio.
+    """
+    def __init__(self, ratios=None):
+        """
+        Args:
+            ratios: dict with pair (paramer_name, pruned_ratio). 
+        """
+        self.ratios = ratios
+    def prune(self, param, ratio=None):
+        """
+        Args:
+            ratio: `ratio=40%` means pruning (1 - 40%) weights to zero.
+        """
+        if ratio is None:
+            rat = self.ratios[
+                param.name] if param.name in self.ratios else self.ratios['*']
+        else:
+            rat = ratio
+        if rat < 1.0:
+            k = max(int(rat * np.prod(param.shape)), 1)
+            param_vec = layers.reshape(x=param, shape=[1, -1])
+            param_topk, _ = layers.topk(param_vec, k=k)
+            threshold = layers.slice(
+                param_topk, axes=[1], starts=[-1], ends=[k])
+            threshold = layers.reshape(x=threshold, shape=[1])
+            zeros_mask = layers.less_than(x=param, y=threshold)
+        else:
+            zeros_mask = layers.ones(param.shape)
+        return zeros_mask
--- a/python/paddle/fluid/contrib/slim/unitest/__init__.py
+++ b/python/paddle/fluid/contrib/slim/unitest/__init__.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml
+++ b/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml
+version: 1.0
+include: ["./unitest/configs/pruners.yaml", "./unitest/configs/pruners_0.yaml"]
+pruners:
+    pruner_1:
+        class: 'RatioPruner'
+        ratios:
+            'conv1_1.w': 0.3
+            'conv1_2.w': 0.4
+            '*': 0.9
+        group_dims:
+            '*': [1, 2, 3]
+        criterions:
+            '*': 'l1-norm'
+strategies:
+    strategy_1:
+        class: 'SensitivePruneStrategy'
+        pruner: 'pruner_2'
+        start_epoch: 0
+        end_epoch: 10
+        delta_rate: 0.20
+        acc_loss_threshold: 0.2
+        sensitivities:
+            'conv1_1.w': 0.4
+compress_pass:
+    class: 'CompressPass'
+    epoch: 100
+    strategies:
+        - strategy_1
--- a/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml
+++ b/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml
+version: 1.0
+pruners:
+    pruner_2:
+        class: 'RatioPruner'
+        ratios:
+            'conv1_1.w': 0.5
+            'conv1_2.w': 0.2
+            '*': 0.7
+        group_dims:
+            '*': [1, 2, 3]
+        criterions:
+            '*': 'l1-norm'
--- a/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml
+++ b/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml
+version: 1.0
+pruners:
+    pruner_3:
+        class: 'RatioPruner'
+        ratios:
+            'conv1_1.w': 0.5
+            'conv1_2.w': 0.2
+            '*': 0.7
+        group_dims:
+            '*': [1, 2, 3]
+        criterions:
+            '*': 'l1-norm'
--- a/python/paddle/fluid/contrib/slim/unitest/test_factory.py
+++ b/python/paddle/fluid/contrib/slim/unitest/test_factory.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.fluid.contrib.slim import ConfigFactory
+import unittest
+class TestFactory(unittest.TestCase):
+    def test_parse(self):
+        factory = ConfigFactory('./unitest/configs/config.yaml')
+        pruner = factory.instance('pruner_1')
+        self.assertEquals(pruner.ratios['conv1_1.w'], 0.3)
+        pruner = factory.instance('pruner_2')
+        self.assertEquals(pruner.ratios['*'], 0.7)
+        strategy = factory.instance('strategy_1')
+        pruner = strategy.pruner
+        self.assertEquals(pruner.ratios['*'], 0.7)
+        compress_pass = factory.get_compress_pass()
+        self.assertEquals(compress_pass.epoch, 100)
+        strategy = compress_pass.strategies[0]
+        self.assertEquals(strategy.delta_rate, 0.2)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/net_drawer.py
+++ b/python/paddle/fluid/net_drawer.py
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 try:
-    from .graphviz import Digraph
+    from .graphviz import Graph
 except ImportError:
    logger.info(
        'Cannot import graphviz, which is required for drawing a network. This '
@@ -112,7 +112,7 @@ def draw_graph(startup_program, main_program, **kwargs):
    filename = kwargs.get("filename")
    if filename == None:
        filename = str(graph_id) + ".gv"
-    g = Digraph(
+    g = Graph(
        name=str(graph_id),
        filename=filename,
        graph_attr=GRAPH_STYLE,

--- a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
@@ -23,16 +23,6 @@ class TestTransposeMKLDNN(TestTransposeOp):
    def init_op_type(self):
        self.op_type = "transpose2"
        self.use_mkldnn = True
-        self.is_test = True
-        return
-    def test_check_grad(self):
-        return
-    def test_check_grad_no_input(self):
-        return
-    def test_check_grad_no_filter(self):
        return

--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -27,7 +27,6 @@ class TestTransposeOp(OpTest):
        self.attrs = {
            'axis': list(self.axis),
            'use_mkldnn': self.use_mkldnn,
-            'is_test': self.is_test,
        }
        self.outputs = {
            'XShape': np.random.random(self.shape).astype("float32"),
@@ -37,7 +36,6 @@ class TestTransposeOp(OpTest):
    def init_op_type(self):
        self.op_type = "transpose2"
        self.use_mkldnn = False
-        self.is_test = False
    def test_check_output(self):
        self.check_output(no_check_set=['XShape'])

--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -9,3 +9,5 @@ Pillow
 nltk>=3.2.2
 graphviz
 six
+funcsigs
+pyyaml
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -109,6 +109,10 @@ packages=['paddle',
          'paddle.fluid.contrib',
          'paddle.fluid.contrib.decoder',
          'paddle.fluid.contrib.quantize',
+          'paddle.fluid.contrib.slim',
+          'paddle.fluid.contrib.slim.core',
+          'paddle.fluid.contrib.slim.graph',
+          'paddle.fluid.contrib.slim.prune',
          'paddle.fluid.contrib.utils',
          'paddle.fluid.transpiler',
          'paddle.fluid.transpiler.details']
@@ -140,8 +144,6 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
                   '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
 package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]}
-if os.name == 'nt':
-    package_data['paddle.fluid'] += ['openblas' + ext_name]
 if '${WITH_FLUID_ONLY}'== 'OFF':
    package_data['paddle.v2.master']=['libpaddle_master' + ext_name]
@@ -166,11 +168,17 @@ package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') +
 shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
 if '${WITH_MKL}' == 'ON':
-    shutil.copy('${MKLML_LIB}', libs_path)
+    shutil.copy('${MKLML_SHARED_LIB}', libs_path)
-    shutil.copy('${MKLML_IOMP_LIB}', libs_path)
+    shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
-    package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name]
+    package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name]
+else:
+    if os.name == 'nt':
+        # copy the openblas.dll
+        shutil.copy(os.path.dirname('${CBLAS_LIBRARIES}') + '/openblas' + ext_name, libs_path)
+        package_data['paddle.libs'] += ['openblas' + ext_name]
 if '${WITH_MKLDNN}' == 'ON':
-    if '${CMAKE_BUILD_TYPE}' == 'Release':
+    if '${CMAKE_BUILD_TYPE}' == 'Release' and os.name != 'nt':
        # only change rpath in Release mode.
        # TODO(typhoonzero): use install_name_tool to patch mkl libs once
        # we can support mkl on mac.
@@ -181,7 +189,7 @@ if '${WITH_MKLDNN}' == 'ON':
        command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
        if os.system(command) != 0:
            raise Exception("patch libmkldnn.so failed, command: %s" % command)
-    package_data['paddle.libs']+=['libmkldnn.so.0']
+    package_data['paddle.libs']+=['libmkldnn.so.0' if os.name != 'nt' else ('mkldnn' + ext_name)]
    shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
 if '${WITH_NGRAPH}' == 'ON':
    # only change rpath in Release mode,