diff --git a/CMakeLists.txt b/CMakeLists.txt index efdb451f659a290f5e0db1dec5faff6295675e2d..aa9446a694dd90675f9b938b2409208baf6a3409 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,10 +203,10 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream -include(external/warpctc) # download, build, install warpctc if (NOT WIN32) -# there is no official support of nccl, cupti in windows +# there is no official support of warpctc, nccl, cupti in windows +include(external/warpctc) # download, build, install warpctc include(cupti) include(external/gzstream) endif (NOT WIN32) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 5be7be64137be57f078739e5f287dd4bb0dcbd4f..414e92eb27f56e0670e1977e67c2f5ca9c6bbcc2 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -139,12 +139,10 @@ endfunction() message(STATUS "CUDA detected: " ${CUDA_VERSION}) if (${CUDA_VERSION} LESS 7.0) set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) - add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") @@ -152,7 +150,6 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x # CUDA 8 may complain that sm_20 is no longer supported. Suppress the # warning for now. list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") - add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") endif() include_directories(${CUDA_INCLUDE_DIRS}) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 96a9917e7622ee9c617ea51d63755063bc5a201d..09bec347dbd569203103eccc7dbc0521c291bc0a 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -89,7 +89,6 @@ if(CUDNN_FOUND) if(NOT CUDNN_MAJOR_VERSION) set(CUDNN_VERSION "???") else() - add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 70d159b4f3549662e080794efad8af943ce1f0bc..2ced43f9e6c60da642f7a6252f889d9c9ab9748f 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,7 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 566dc75fda019eb66759eb403f60e16f18cffef1..86096d4feaace040da416a01872882456c4098fc 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,43 +57,46 @@ int main() return 0; }" SSE3_FOUND) -# Check AVX -set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) -set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; -}" AVX_FOUND) +# disable AVX by default on windows +if(NOT WIN32) + # Check AVX + set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) + set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; + }" AVX_FOUND) -# Check AVX 2 -set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) -set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; -}" AVX2_FOUND) + # Check AVX 2 + set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) + set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; + }" AVX2_FOUND) -# Check AVX512F -set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) -set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; -}" AVX512F_FOUND) + # Check AVX512F + set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) + set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; + }" AVX512F_FOUND) +endif(NOT WIN32) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 90083f690fe9f095920b2b61a4903fd0064877c2..225dfb3e700232ae90d13ed55ab38fbd86c913bd 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -15,7 +15,8 @@ function(windows_symbolic TARGET) file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) add_custom_command(OUTPUT ${final_path}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cc" "${final_path}/.${src}.cu" COMMENT "create hidden file of ${src}.cu") add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index c3a044d22cf04dceecc164fae934ee15c4563af1..6940250c3f9663bbb734d5a6eb78135aecbc3a3b 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable paddle::memory::AllocationPtr gpu_; + mutable memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 2c1648c81fc999c6306d5b08bc243f3ad21fec04..6d39bb3c524b4725dfebd6ef07594b0b45c65463 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,8 +23,7 @@ limitations under the License. */ #include #include -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" // For VLOG() +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 307c348822306f2b50389430df4d8cd2033efc29..3e8fb83e9d5ba2078bcf37e4a4af74708df9c11c 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -17,6 +17,7 @@ limitations under the License. */ #ifdef _WIN32 #include +#include // VirtualLock/VirtualUnlock #else #include // for mlock and munlock #endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 95ad67e33e26ea8b40ece4e7a2040c1b9e8b1e38..257bfc0a3f926d20abc4647b27e8e9cc2c49e014 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -44,8 +44,9 @@ endif() register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) + # warpctc_op needs cudnn 7 above -if (WITH_GPU) +if (WITH_GPU AND NOT WIN32) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() @@ -63,7 +64,9 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) +if (NOT WIN32) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) +endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 7c0fda4169b5e1cf663d04b78b6425d73965c292..999fdcff90784ed089cd620a4f0a908f196bcdda 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index bf9aef9135009f1db1604edfc56e5270f87d74d7..c600d1e3d76f7a989dd61e72caf4967aa5923c6f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -19,9 +19,6 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/math/jit_kernel.h" -#if defined(_WIN32) && defined(_WINSOCKAPI_) -#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ -#endif #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index 2e3779ff0845294e71f27801049c010e0a585e6b..ccbd05c82ad6a880d21269092088be9656b35c99 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -17,12 +17,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" -#if defined(_WIN32) -#if defined(__AVX2__) || defined(__AVX__) -inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } -#endif -#endif - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h index 2bc740e59832942aa6a8062def9fdd4b0a6a79ff..6abf3434cc8d8f6ab2838ef822a4f6b948331802 100644 --- a/paddle/fluid/operators/math/jit_gen.h +++ b/paddle/fluid/operators/math/jit_gen.h @@ -18,9 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/platform/macros.h" -#if defined(_WIN32) && defined(_WINSOCKAPI_) -#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ -#endif #define XBYAK_USE_MMAP_ALLOCATOR #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 07159d4a12ef4b628f7705ed206d3334be46dfc8..5939c500c946c44579d1de645ac9700c7701a4e9 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -16,7 +16,9 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) +if (NOT WIN32) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +endif(NOT WIN32) if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index 91d9a1ef013449e83f2540a6646c96e34347ccc1..f3cd3b2bbedef7c9140c2acddea0732972ff7fa0 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -38,10 +38,6 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); #endif -#ifdef CUDNN_DNN_ROUTINE_EACH_R6 -CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); -#endif - #ifdef CUDNN_DNN_ROUTINE_EACH_R7 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 15d516836652ea4ea4d1bcdf35022e6b79cc3b52..cc5cda6106c188f3156d33480b5d3641eed32556 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -53,12 +53,6 @@ namespace platform { namespace dynload { static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; -#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll"; -#endif - static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -171,8 +165,6 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -181,8 +173,6 @@ void* GetCublasDsoHandle() { void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); #endif @@ -203,8 +193,6 @@ void* GetCUPTIDsoHandle() { void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif @@ -213,8 +201,6 @@ void* GetCurandDsoHandle() { void* GetWarpCTCDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so"); #endif @@ -239,8 +225,6 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); #endif diff --git a/python/setup.py.in b/python/setup.py.in index f4613dd72de50bb39cb34a60b61a557006220c29..ff3aca5714cca5d0d115dd45e7a754a7ab52c099 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -158,11 +158,10 @@ if '${WITH_FLUID_ONLY}'== 'OFF': # put all thirdparty libraries in paddle.libs libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' - -package_data['paddle.libs']= [] -package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name] -shutil.copy('${WARPCTC_LIBRARIES}', libs_path) - +if os.name != 'nt': + package_data['paddle.libs']= [] + package_data['paddle.libs']=['libwarpctc' + ext_name] + shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)