include the mkl fix only

test=develop

include the mkl fix only
test=develop
b601f2de · peizhilin · 001891ae · b601f2de · b601f2de · b601f2de
18 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -203,10 +203,10 @@ include(external/xxhash)    # download xxhash
 include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
-include(external/warpctc)   # download, build, install warpctc

 if (NOT WIN32)
-# there is no official support of nccl, cupti in windows
+# warpctc, nccl and cupti are not officially supported on Windows
+include(external/warpctc)   # download, build, install warpctc
 include(cupti)
 include(external/gzstream)
 endif (NOT WIN32)

--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -139,12 +139,10 @@ endfunction()
 message(STATUS "CUDA detected: " ${CUDA_VERSION})
 if (${CUDA_VERSION} LESS 7.0)
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
-  add_definitions("-DPADDLE_CUDA_BINVER=\"60\"")
 elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
-  add_definitions("-DPADDLE_CUDA_BINVER=\"70\"")
 elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
@@ -152,7 +150,6 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
  # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
  # warning for now.
  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
-  add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
 endif()

 include_directories(${CUDA_INCLUDE_DIRS})

--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -89,7 +89,6 @@ if(CUDNN_FOUND)
        if(NOT CUDNN_MAJOR_VERSION)
            set(CUDNN_VERSION "???")
        else()
-            add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
            math(EXPR CUDNN_VERSION
                "${CUDNN_MAJOR_VERSION} * 1000 +
                 ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")

--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,7 +84,7 @@ function(op_library TARGET)
    endif()
    if (WIN32)
    # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
          return()
        endif()

--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -57,43 +57,46 @@ int main()
    return 0;
 }" SSE3_FOUND)

-# Check AVX
-set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
-set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
+# skip the AVX, AVX2 and AVX512F probes on Windows (the checks below never run there)
+if(NOT WIN32)
+    # Check AVX
+    set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+    set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
        __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
        __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
        __m256 result = _mm256_add_ps (a, b);
        return 0;
-}" AVX_FOUND)
+    }" AVX_FOUND)

-# Check AVX 2
-set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
-set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
+    # Check AVX 2
+    set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+    set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
        __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
        __m256i result = _mm256_abs_epi32 (a);
        return 0;
-}" AVX2_FOUND)
+    }" AVX2_FOUND)

-# Check AVX512F
-set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
-set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
+    # Check AVX512F
+    set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+    set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
        __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
                                      13, -5, 6, -7, 9, 2, -6, 3);
        __m512i result = _mm512_abs_epi32 (a);
        return 0;
-}" AVX512F_FOUND)
+    }" AVX512F_FOUND)
+endif(NOT WIN32)

 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -15,7 +15,8 @@ function(windows_symbolic TARGET)
    file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)

    add_custom_command(OUTPUT ${final_path}/.${src}.cu
-            COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
+            COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
+            COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
            COMMENT "create hidden file of ${src}.cu")
    add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
  endforeach()

--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -215,7 +215,7 @@ class Vector {
      auto stream = dev_ctx->stream();
      void *src = gpu_->ptr();
      void *dst = cpu_.data();
-      paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
+      memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
                   gpu_->size(), stream);
      dev_ctx->Wait();
    }
@@ -261,7 +261,7 @@ class Vector {
      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
          platform::DeviceContextPool::Instance().Get(place));
      auto stream = dev_ctx->stream();
-      paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
+      memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
                   gpu_->size(), stream);
    }

@@ -284,7 +284,7 @@ class Vector {
    bool IsInCPU() const { return flag_ & kDataInCPU; }

    mutable std::vector<T> cpu_;
-    mutable paddle::memory::AllocationPtr gpu_;
+    mutable memory::AllocationPtr gpu_;
    mutable int flag_;

    mutable std::mutex mtx_;

--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -23,7 +23,6 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>

-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "glog/logging.h"  // For VLOG()
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/details/op_registry.h"

--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -17,6 +17,7 @@ limitations under the License. */

 #ifdef _WIN32
 #include <malloc.h>
+#include <windows.h>  // VirtualLock/VirtualUnlock
 #else
 #include <sys/mman.h>  // for mlock and munlock
 #endif

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -44,8 +44,9 @@ endif()

 register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})

+
 # warpctc_op needs cudnn 7 above
-if (WITH_GPU)
+if (WITH_GPU AND NOT WIN32)
    if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
        op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
    else()
@@ -63,7 +64,9 @@ endif()
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})

 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
+if (NOT WIN32)
+    set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
+endif()
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)

--- a/paddle/fluid/operators/cum_op.h
+++ b/paddle/fluid/operators/cum_op.h
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-
-#include <array>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"

--- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
@@ -19,9 +19,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"

 #include "paddle/fluid/operators/math/jit_kernel.h"
-#if defined(_WIN32) && defined(_WINSOCKAPI_)
-#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */
-#endif
 #include "xbyak/xbyak.h"
 #include "xbyak/xbyak_util.h"


--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
@@ -17,12 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"

-#if defined(_WIN32)
-#if defined(__AVX2__) || defined(__AVX__)
-inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); }
-#endif
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {

--- a/paddle/fluid/operators/math/jit_gen.h
+++ b/paddle/fluid/operators/math/jit_gen.h
@@ -18,9 +18,6 @@ limitations under the License. */
 #include <type_traits>
 #include "paddle/fluid/platform/macros.h"

-#if defined(_WIN32) && defined(_WINSOCKAPI_)
-#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */
-#endif
 #define XBYAK_USE_MMAP_ALLOCATOR
 #include "xbyak/xbyak.h"
 #include "xbyak/xbyak_util.h"

--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -16,7 +16,9 @@ if (CUPTI_FOUND)
    list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
+if (NOT WIN32)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
+endif(NOT WIN32)
 if (WITH_MKLML)
    cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
 endif()

--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -38,10 +38,6 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
 #endif

-#ifdef CUDNN_DNN_ROUTINE_EACH_R6
-CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
-#endif
-
 #ifdef CUDNN_DNN_ROUTINE_EACH_R7
 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 #endif

--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -53,12 +53,6 @@ namespace platform {
 namespace dynload {
 static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;

-#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll";
-static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll";
-static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll";
-#endif
-
 static inline std::string join(const std::string& part1,
                               const std::string& part2) {
  // directory separator
@@ -171,8 +165,6 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
 void* GetCublasDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib);
 #else
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
 #endif
@@ -181,8 +173,6 @@ void* GetCublasDsoHandle() {
 void* GetCUDNNDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
 #else
  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
 #endif
@@ -203,8 +193,6 @@ void* GetCUPTIDsoHandle() {
 void* GetCurandDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
-#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib);
 #else
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
 #endif
@@ -213,8 +201,6 @@ void* GetCurandDsoHandle() {
 void* GetWarpCTCDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll");
 #else
  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
 #endif
@@ -239,8 +225,6 @@ void* GetTensorRtDsoHandle() {
 void* GetMKLMLDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
 #else
  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
 #endif

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -158,11 +158,10 @@ if '${WITH_FLUID_ONLY}'== 'OFF':

 # put all thirdparty libraries in paddle.libs
 libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
-
-package_data['paddle.libs']= []
-package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
-shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
-
+if os.name != 'nt':
+    package_data['paddle.libs']= []
+    package_data['paddle.libs']=['libwarpctc' + ext_name]
+    shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
 if '${WITH_MKL}' == 'ON':
    shutil.copy('${MKLML_SHARED_LIB}', libs_path)
    shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)