From 210fa77770193d99e016bab967a209c0bc57a424 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Mon, 8 Aug 2022 10:24:31 +0800 Subject: [PATCH] nvcclazylinux (#44957) --- CMakeLists.txt | 696 ++++++++++-------- cmake/experimental.cmake | 17 + .../cuda_module_loading_lazy.cmake | 55 ++ tools/nvcc_lazy.sh | 70 ++ 4 files changed, 537 insertions(+), 301 deletions(-) create mode 100644 cmake/experimental.cmake create mode 100644 cmake/experiments/cuda_module_loading_lazy.cmake create mode 100644 tools/nvcc_lazy.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b830484127..c4286292b01 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,12 +13,12 @@ # limitations under the License if(APPLE AND WITH_ARM) - # cmake 3.19.2 version starts to support M1 - cmake_minimum_required(VERSION 3.19.2) - cmake_policy(VERSION 3.19.2) + # cmake 3.19.2 version starts to support M1 + cmake_minimum_required(VERSION 3.19.2) + cmake_policy(VERSION 3.19.2) else(APPLE AND WITH_ARM) - cmake_minimum_required(VERSION 3.15) - cmake_policy(VERSION 3.10) + cmake_minimum_required(VERSION 3.15) + cmake_policy(VERSION 3.10) endif(APPLE AND WITH_ARM) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) @@ -28,9 +28,12 @@ include(system) # Note(zhouwei): Ninja Generator will set CMAKE_BUILD_TYPE to Debug if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" - FORCE) + set(CMAKE_BUILD_TYPE + "Release" + CACHE + STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) endif() project(paddle CXX C) @@ -39,152 +42,181 @@ project(paddle CXX C) # TODO(Shibo Tao): remove find_package(CUDA) completely. 
find_package(CUDA QUIET) find_package(MKL CONFIG QUIET) -option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) -option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) -option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) -option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) -option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF) -option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF) -option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) -option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) -option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) +option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) +option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) +option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) +option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) +option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF) +option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF) +option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) +option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) +option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) # NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # to develop some acl related functionality on x86 -option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) -option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) -option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) +option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) +option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +option(WITH_ONNXRUNTIME "Compile PaddlePaddle with 
ONNXRUNTIME" OFF) # Note(zhouwei): It use option above, so put here include(init) -include(generic) # simplify cmake module +include(generic) # simplify cmake module +include(experimental) # experimental build options -if (WITH_GPU AND WITH_XPU) - message(FATAL_ERROR "Error when compile GPU and XPU at the same time") +if(WITH_GPU AND WITH_XPU) + message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() -if (WITH_GPU AND WITH_XPU_KP) - message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time") +if(WITH_GPU AND WITH_XPU_KP) + message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time") endif() -if (WITH_GPU AND WITH_ASCEND) - message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") +if(WITH_GPU AND WITH_ASCEND) + message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() -if (WITH_GPU AND WITH_ROCM) - message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") +if(WITH_GPU AND WITH_ROCM) + message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() -if (WITH_GPU AND WITH_MLU) - message(FATAL_ERROR "Error when compile GPU and MLU at the same time") +if(WITH_GPU AND WITH_MLU) + message(FATAL_ERROR "Error when compile GPU and MLU at the same time") endif() if(WITH_GPU AND NOT APPLE) - enable_language(CUDA) - message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: " - "${CMAKE_CUDA_COMPILER_ID} ${CMAKE_CUDA_COMPILER_VERSION}") + enable_language(CUDA) + message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: " + "${CMAKE_CUDA_COMPILER_ID} ${CMAKE_CUDA_COMPILER_VERSION}") endif() message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " - "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " - "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") 
message(STATUS "AR tools: ${CMAKE_AR}") # MUSL build turn off warnings if(WITH_MUSL) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy" + ) endif() if(APPLE AND WITH_ARM) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") - set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -target arm64-apple-darwin") endif() if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() if(WIN32) - option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) - - set(CMAKE_SUPPRESS_REGENERATION ON) - set(CMAKE_STATIC_LIBRARY_PREFIX lib) - - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj") - - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline") + option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) + + set(CMAKE_SUPPRESS_REGENERATION ON) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) + + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") +
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj") + + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline") + endif() + + if(MSVC_STATIC_CRT) + message( + STATUS + "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019" + ) + foreach( + flag_var + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif() + endforeach(flag_var) + endif() + + # NOTE(zhouwei): msvc max/min macro conflict with std::min/max, define NOMINMAX globally + add_definitions("-DNOMINMAX") + # windows build turn off warnings, use parallel compiling. + foreach( + flag_var + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELWITHDEBINFO) + string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") + + # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling, + # For Visual Studio generators, /MP should be added. + # For other generators like Ninja, it is not need to add /MP. 
+ if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT WITH_GPU) + math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") + set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") endif() - - if (MSVC_STATIC_CRT) - message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019") - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE - CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif() - endforeach(flag_var) + endforeach(flag_var) + foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) + set(${flag_var} "${${flag_var}} /w") + endforeach(flag_var) + + # Windows Remove /Zi, /ZI for Release, MinSizeRel builds + foreach(flag_var + CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL) + if(${flag_var} MATCHES "/Z[iI]") + string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}") endif() + endforeach(flag_var) + + set(CMAKE_C_FLAGS + "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838" + ) + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838" + ) + + foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS + CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) + set(${flag_var} + "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + if(MSVC_STATIC_CRT) + set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") + endif() + endforeach(flag_var) - # NOTE(zhouwei): msvc max/min macro conflict with std::min/max, define NOMINMAX globally - add_definitions("-DNOMINMAX") - # windows build turn off warnings, use parallel compiling. 
- foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE - CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - - # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling, - # For Visual Studio generators, /MP should be added. - # For other generators like Ninja, it is not need to add /MP. - if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT WITH_GPU) - math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") - set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") - endif() - endforeach(flag_var) - foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) - set(${flag_var} "${${flag_var}} /w") - endforeach(flag_var) - - # Windows Remove /Zi, /ZI for Release, MinSizeRel builds - foreach(flag_var - CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL) - if(${flag_var} MATCHES "/Z[iI]") - string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}") - endif() - endforeach(flag_var) - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") + if(WITH_WIN_DUMP_DBG) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi") - foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) - set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") - if(MSVC_STATIC_CRT) - set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") - endif() + foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS + CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) + set(${flag_var} "${${flag_var}} /DEBUG 
/OPT:REF /OPT:ICF") endforeach(flag_var) - if (WITH_WIN_DUMP_DBG) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi") - - foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) - set(${flag_var} "${${flag_var}} /DEBUG /OPT:REF /OPT:ICF") - endforeach(flag_var) - - add_definitions("-DWITH_WIN_DUMP_DBG") - endif() + add_definitions("-DWITH_WIN_DUMP_DBG") + endif() else(WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations" + ) endif(WIN32) find_package(Git REQUIRED) @@ -192,7 +224,7 @@ find_package(Git REQUIRED) # config GIT_URL with github mirrors to speed up dependent repos clone option(GIT_URL "Git URL to clone dependent repos" ${GIT_URL}) if(NOT GIT_URL) - set(GIT_URL "https://github.com") + set(GIT_URL "https://github.com") endif() find_package(Threads REQUIRED) @@ -200,58 +232,75 @@ find_package(Threads REQUIRED) include(simd) ################################ Exposed Configurations ####################################### -option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) -option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) -option(WITH_MKL "Compile PaddlePaddle with MKL support." 
${AVX_FOUND}) -option(WITH_SYSTEM_BLAS "Use system blas library" OFF) -option(WITH_DISTRIBUTE "Compile with distributed support" OFF) -option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) -option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF) +option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) +option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) +option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) +option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) +option(WITH_SYSTEM_BLAS "Use system blas library" OFF) +option(WITH_DISTRIBUTE "Compile with distributed support" OFF) +option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) +option(ON_INFER "Turn on inference optimization and inference-lib generation" + OFF) ################################ Internal Configurations ####################################### -option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) -option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) -option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) -option(WITH_INCREMENTAL_COVERAGE "Generate coverage reports only for incremental code" OFF) -OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF) -option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) -option(WITH_PSLIB "Compile with pslib support" OFF) -option(WITH_BOX_PS "Compile with box_ps support" OFF) -option(WITH_XBYAK "Compile with xbyak support" ON) -option(WITH_CONTRIB "Compile the third-party contributation" OFF) -option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) -option(WITH_HETERPS "Compile with heterps" OFF}) -option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) -option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) -option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) -option(SANITIZER_TYPE 
"Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) -option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) -option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) -option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF) -option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) -option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) -option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) -option(WITH_CNCL "Compile PaddlePaddle with CNCL support" OFF) -option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) -option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) -option(WITH_SW "Compile PaddlePaddle with sw support" OFF) -option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) -option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) -option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) -option(WITH_STRIP "Strip so files of Whl packages" OFF) -option(NEW_RELEASE_PYPI "PaddlePaddle next-level release strategy for pypi cubin package" OFF) -option(NEW_RELEASE_ALL "PaddlePaddle next-level release strategy for all arches cubin package" OFF) -option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF) -option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) -option(WITH_POCKETFFT "Compile with pocketfft support" ON) -option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) -option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) +option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) +option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" + OFF) +option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) +option(WITH_INCREMENTAL_COVERAGE + "Generate coverage reports only for incremental code" OFF) +option(WITH_LIBXSMM "Compile with libxsmm" OFF) +option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) 
+option(WITH_PSLIB "Compile with pslib support" OFF) +option(WITH_BOX_PS "Compile with box_ps support" OFF) +option(WITH_XBYAK "Compile with xbyak support" ON) +option(WITH_CONTRIB "Compile the third-party contributation" OFF) +option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) +option(WITH_HETERPS "Compile with heterps" OFF) +option(WITH_INFERENCE_API_TEST + "Test fluid inference C++ high-level api interface" OFF) +option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) +option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) +option( + SANITIZER_TYPE + "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" + OFF) +option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) +option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) +option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF) +option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) +option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) +option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) +option(WITH_CNCL "Compile PaddlePaddle with CNCL support" OFF) +option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) +option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) +option(WITH_SW "Compile PaddlePaddle with sw support" OFF) +option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) +option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) +option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) +option(WITH_STRIP "Strip so files of Whl packages" OFF) +option(NEW_RELEASE_PYPI + "PaddlePaddle next-level release strategy for pypi cubin package" OFF) +option(NEW_RELEASE_ALL + "PaddlePaddle next-level release strategy for all arches cubin package" + OFF) +option(NEW_RELEASE_JIT + "PaddlePaddle next-level release strategy for backup jit package" OFF) +option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF)
+option(WITH_POCKETFFT "Compile with pocketfft support" ON) +option(WITH_RECORD_BUILDTIME + "Compile PaddlePaddle with record all targets build time" OFF) +option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) if(WITH_RECORD_BUILDTIME) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") -else() - include(ccache) # set ccache for compilation ; if WITH_RECORD_BUILDTIME=ON can't use ccache + set_property( + GLOBAL PROPERTY RULE_LAUNCH_COMPILE + "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") + set_property( + GLOBAL PROPERTY RULE_LAUNCH_LINK + "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") +else() + include(ccache + )# set ccache for compilation ; if WITH_RECORD_BUILDTIME=ON can't use ccache endif() unset(WITH_RECORD_BUILDTIME CACHE) @@ -261,186 +310,224 @@ if(NOT PY_VERSION) endif() set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) - # the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined. Default: OFF -if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$") +if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES + "^(Address|Leak|Memory|Thread|Undefined)$") message("Choose the correct type of sanitizer") return() endif() -if (LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER) -set(WITH_CUSTOM_DEVICE ON) +if(LINUX + AND NOT WITH_CUSTOM_DEVICE + AND NOT ON_INFER) + set(WITH_CUSTOM_DEVICE ON) endif() if(WIN32) - if(WITH_DISTRIBUTE) - MESSAGE(WARNING - "Disable DISTRIBUTE when compiling for Windows. Force WITH_DISTRIBUTE=OFF.") - set(WITH_DISTRIBUTE OFF CACHE STRING - "Disable DISTRIBUTE when compiling for Windows" FORCE) - endif() - if(WITH_NCCL) - MESSAGE(WARNING - "Disable NCCL when compiling for Windows. 
Force WITH_NCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable NCCL when compiling for Windows" FORCE) - endif() -endif() - -if (NOT WITH_GPU AND WITH_NCCL) - MESSAGE(WARNING - "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable NCCL when compiling without GPU" FORCE) + if(WITH_DISTRIBUTE) + message( + WARNING + "Disable DISTRIBUTE when compiling for Windows. Force WITH_DISTRIBUTE=OFF." + ) + set(WITH_DISTRIBUTE + OFF + CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) + endif() + if(WITH_NCCL) + message( + WARNING "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.") + set(WITH_NCCL + OFF + CACHE STRING "Disable NCCL when compiling for Windows" FORCE) + endif() +endif() + +if(NOT WITH_GPU AND WITH_NCCL) + message( + WARNING "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.") + set(WITH_NCCL + OFF + CACHE STRING "Disable NCCL when compiling without GPU" FORCE) endif() # force WITH_XPU on when WITH_XPU_KP -if (WITH_XPU_KP AND NOT WITH_XPU) - MESSAGE(WARNING - "Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.") - set(WITH_XPU ON CACHE STRING - "Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE) +if(WITH_XPU_KP AND NOT WITH_XPU) + message( + WARNING + "Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.") + set(WITH_XPU + ON + CACHE STRING "Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE) endif() -if (NOT WITH_XPU AND WITH_XPU_BKCL) - MESSAGE(WARNING - "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.") - set(WITH_XPU_BKCL OFF CACHE STRING - "Disable BKCL when compiling without XPU" FORCE) +if(NOT WITH_XPU AND WITH_XPU_BKCL) + message( + WARNING "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.") + set(WITH_XPU_BKCL + OFF + CACHE STRING "Disable BKCL when compiling without XPU" FORCE) endif() -if (NOT WITH_MLU AND WITH_CNCL) - MESSAGE(WARNING - "Disable CNCL when compiling without MLU. 
Force WITH_MLU=OFF.") - set(WITH_MLU OFF CACHE STRING - "Disable CNCL when compiling without MLU" FORCE) +if(NOT WITH_MLU AND WITH_CNCL) + message( + WARNING "Disable CNCL when compiling without MLU. Force WITH_CNCL=OFF.") + set(WITH_CNCL + OFF + CACHE STRING "Disable CNCL when compiling without MLU" FORCE) endif() if(WITH_NCCL) - add_definitions("-DPADDLE_WITH_NCCL") - include(nccl) + add_definitions("-DPADDLE_WITH_NCCL") + include(nccl) else() - if(WITH_GPU) - MESSAGE(WARNING "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used.") - endif() + if(WITH_GPU) + message( + WARNING + "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used." + ) + endif() endif() if(WITH_BRPC_RDMA) - message(STATUS "Use brpc with rdma.") - if(NOT WITH_DISTRIBUTE) - message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") - endif() + message(STATUS "Use brpc with rdma.") + if(NOT WITH_DISTRIBUTE) + message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") + endif() endif() - if(WITH_GPU) - include(cuda) - # lite subgraph compilation depends on CUDNN_ROOT, - # so include(cudnn) needs to be in front of include(third_party/lite) - include(cudnn) # set cudnn libraries, must before configure - include(tensorrt) - # there is no official support of nccl, cupti in windows - if(NOT WIN32) - include(cupti) - endif() + include(cuda) + # lite subgraph compilation depends on CUDNN_ROOT, + # so include(cudnn) needs to be in front of include(third_party/lite) + include(cudnn) # set cudnn libraries, must before configure + include(tensorrt) + # there is no official support of nccl, cupti in windows + if(NOT WIN32) + include(cupti) + endif() endif() if(WITH_MLU) - include(neuware) + include(neuware) endif() if(WITH_ROCM) - include(hip) - include(miopen) # set miopen libraries, must before configure + include(hip) + include(miopen) # set miopen libraries, must
before configure endif(WITH_ROCM) if(WITH_XPU_KP) - include(xpu_kp) + include(xpu_kp) endif() -if (NOT WITH_ROCM AND WITH_RCCL) - MESSAGE(WARNING - "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") - set(WITH_RCCL OFF CACHE STRING - "Disable RCCL when compiling without ROCM" FORCE) +if(NOT WITH_ROCM AND WITH_RCCL) + message( + WARNING "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") + set(WITH_RCCL + OFF + CACHE STRING "Disable RCCL when compiling without ROCM" FORCE) endif() if(WITH_RCCL) - add_definitions("-DPADDLE_WITH_RCCL") - include(rccl) + add_definitions("-DPADDLE_WITH_RCCL") + include(rccl) else() - if(WITH_ROCM) - MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.") - endif() + if(WITH_ROCM) + message( + WARNING + "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used." + ) + endif() endif() if(WITH_HETERPS AND WITH_PSLIB) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() if(WITH_DISTRIBUTE) - if(LINUX) - set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE) - endif() - if(WITH_ASCEND_CL) - # disable WITH_PSCORE for NPU before include third_party - MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.") - set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE) - endif() -endif() - -include(third_party) # download, build, install third_party, Contains about 20+ dependencies - -include(flags) # set paddle compile flags + if(LINUX) + set(WITH_GLOO + ON + CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE) + endif() + if(WITH_ASCEND_CL) + # disable WITH_PSCORE for NPU before include third_party + message( + WARNING + "Disable WITH_PSCORE when compiling with NPU. 
Force WITH_PSCORE=OFF.") + set(WITH_PSCORE + OFF + CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE) + endif() +endif() + +include(third_party +)# download, build, install third_party, Contains about 20+ dependencies + +include(flags) # set paddle compile flags if(WITH_PROFILER) - find_package(Gperftools REQUIRED) - include_directories(${GPERFTOOLS_INCLUDE_DIR}) - add_definitions(-DWITH_GPERFTOOLS) + find_package(Gperftools REQUIRED) + include_directories(${GPERFTOOLS_INCLUDE_DIR}) + add_definitions(-DWITH_GPERFTOOLS) endif() -include(util) # set unittest and link libs -include(version) # set PADDLE_VERSION -include(coveralls) # set code coverage -include(configure) # add paddle env configuration +include(util) # set unittest and link libs +include(version) # set PADDLE_VERSION +include(coveralls) # set code coverage +include(configure) # add paddle env configuration include_directories("${PADDLE_SOURCE_DIR}") if(WITH_NV_JETSON) - set(WITH_ARM ON CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE) + set(WITH_ARM + ON + CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE) endif() if(WITH_ARM) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") - set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON." FORCE) - set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE) - set(WITH_AVX OFF CACHE STRING "Disable AVX when compiling WITH_AVX=OFF." FORCE) - add_definitions(-DPADDLE_WITH_ARM) -endif() - -if (WITH_SW) - # mieee flag solves floating-point exceptions under sw and ALPHA architectures - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee") - set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE) - set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_SW=ON." 
FORCE) - add_definitions(-DPADDLE_WITH_SW) -endif() - -if (WITH_MIPS) - set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_MIPS=ON" FORCE) - add_definitions(-DPADDLE_WITH_MIPS) -endif() - -if (WITH_ONEMKL) - add_definitions(-DPADDLE_WITH_ONEMKL) -endif() - -if (WITH_HETERPS) - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") - endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") + set(WITH_XBYAK + OFF + CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON." FORCE) + set(WITH_MKL + OFF + CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE) + set(WITH_AVX + OFF + CACHE STRING "Disable AVX when compiling WITH_AVX=OFF." FORCE) + add_definitions(-DPADDLE_WITH_ARM) +endif() + +if(WITH_SW) + # mieee flag solves floating-point exceptions under sw and ALPHA architectures + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee") + set(WITH_XBYAK + OFF + CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE) + set(WITH_MKL + OFF + CACHE STRING "Disable MKL when compiling WITH_SW=ON." 
FORCE) + add_definitions(-DPADDLE_WITH_SW) +endif() + +if(WITH_MIPS) + set(WITH_XBYAK + OFF + CACHE STRING "Disable XBYAK when compiling WITH_MIPS=ON" FORCE) + add_definitions(-DPADDLE_WITH_MIPS) +endif() + +if(WITH_ONEMKL) + add_definitions(-DPADDLE_WITH_ONEMKL) +endif() + +if(WITH_HETERPS) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") + endif() endif() set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") @@ -450,25 +537,32 @@ set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") add_definitions(-DPADDLE_DLL_EXPORT) if(ON_INFER) - # you can trun off the paddle fluid and inference lib by set ON_INFER=OFF - message(STATUS "On inference mode, will take place some specific optimization.") - include(inference_lib) - add_definitions(-DPADDLE_ON_INFERENCE) + # you can trun off the paddle fluid and inference lib by set ON_INFER=OFF + message( + STATUS "On inference mode, will take place some specific optimization.") + include(inference_lib) + add_definitions(-DPADDLE_ON_INFERENCE) else() - #TODO(luotao), combine this warning with `make inference_lib_dist` command. - message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") + #TODO(luotao), combine this warning with `make inference_lib_dist` command. + message( + WARNING + "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only." + ) endif() if(WITH_STRIP) - find_program(STRIP_PATH strip) - if(NOT STRIP_PATH OR NOT LINUX) - set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE) - endif() + find_program(STRIP_PATH strip) + if(NOT STRIP_PATH OR NOT LINUX) + set(WITH_STRIP + OFF + CACHE STRING "Command strip is only used on Linux when it exists." 
+ FORCE) + endif() endif() add_subdirectory(paddle) if(WITH_PYTHON) - add_subdirectory(python) + add_subdirectory(python) endif() get_directory_property(all_inc_dirs INCLUDE_DIRECTORIES) diff --git a/cmake/experimental.cmake b/cmake/experimental.cmake new file mode 100644 index 00000000000..df98a86a0a8 --- /dev/null +++ b/cmake/experimental.cmake @@ -0,0 +1,17 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# this file contains experimental build options + +include(experiments/cuda_module_loading_lazy) diff --git a/cmake/experiments/cuda_module_loading_lazy.cmake b/cmake/experiments/cuda_module_loading_lazy.cmake new file mode 100644 index 00000000000..f4ab829b285 --- /dev/null +++ b/cmake/experiments/cuda_module_loading_lazy.cmake @@ -0,0 +1,55 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# Experimental build option: lazy CUDA module loading for CUDA older than 11.7.
+# CUDA 11.7+ already supports lazy module loading, so nothing is done there.
+# For older toolkits on Linux inference builds, tools/nvcc_lazy.sh generates an
+# nvcc wrapper (tools/nvcc_lazy) that is then used as the CUDA compiler.
+
+if(LINUX)
+  # Bare names / quoted expansions keep if() well-formed when a variable is
+  # unset; an unquoted ${VAR} would expand to nothing and break the condition.
+  if(NOT ON_INFER)
+    message(
+      "EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms"
+    )
+    return()
+  endif()
+  if(NOT CUDA_FOUND)
+    message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA")
+    return()
+  endif()
+  if("${CUDA_VERSION}" VERSION_GREATER_EQUAL "11.7")
+    message("cuda 11.7+ already support lazy module loading")
+    return()
+  endif()
+
+  message(
+    "for cuda before 11.7, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a"
+  )
+  set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "" FORCE)
+  set(CMAKE_CUDA_FLAGS "--cudart shared")
+  enable_language(CUDA)
+
+  # (Re)generate the nvcc wrapper. Several COMMANDs in a single execute_process()
+  # run concurrently as a pipeline, so each step is a separate call to keep the
+  # rm -> chmod -> generate -> chmod order.
+  execute_process(COMMAND "rm" "-rf" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy")
+  execute_process(COMMAND "chmod" "755" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy.sh")
+  execute_process(COMMAND "bash" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy.sh"
+                          "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" "${CUDA_TOOLKIT_ROOT_DIR}")
+  execute_process(COMMAND "chmod" "755" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy")
+  set(CUDA_NVCC_EXECUTABLE "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE)
+  set(CMAKE_CUDA_COMPILER "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE)
+endif()
diff --git a/tools/nvcc_lazy.sh b/tools/nvcc_lazy.sh
new file mode 100644
index 00000000000..efb0223ae6c
--- /dev/null
+++ b/tools/nvcc_lazy.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Generates the nvcc wrapper that adds lazy CUDA module loading for CUDA < 11.7.
+#   $1: path of the wrapper script to create (overwritten if it exists)
+#   $2: CUDA toolkit root; expanded here and baked into the wrapper
+# Escaped \$ reach the wrapper unexpanded and are evaluated when it runs; each
+# echo/printf string below is one wrapper line (printf adds the blank lines).
+: "${1:?usage: nvcc_lazy.sh <wrapper-path> <cuda-toolkit-root>}" "${2:?missing <cuda-toolkit-root>}"
+{
+  echo "#!/usr/bin/env bash"
+  echo "unset GREP_OPTIONS"
+  echo "set -e"
+  printf '\n%s\n' "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved."
+  echo "#"
+  echo "# Licensed under the Apache License, Version 2.0 (the \"License\");"
+  echo "# you may not use this file except in compliance with the License."
+  echo "# You may obtain a copy of the License at"
+  echo "#"
+  echo "# http://www.apache.org/licenses/LICENSE-2.0"
+  echo "#"
+  echo "# Unless required by applicable law or agreed to in writing, software"
+  echo "# distributed under the License is distributed on an \"AS IS\" BASIS,"
+  echo "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
+  echo "# See the License for the specific language governing permissions and"
+  echo "# limitations under the License."
+  printf '\n\n%s\n\n' "## CUDA_MODULE_LOADING=EAGER,DEFAULT,LAZY"
+  echo "# set cicc PATH for Centos"
+  echo "export PATH=\$PATH:$2/nvvm/bin"
+  printf '\n%s\n' "# check nvcc version, if nvcc >= 11.7, just run nvcc itself"
+  echo "CUDA_VERSION=\$(nvcc --version | grep -oP '(?<=V)\d*\.\d*')"
+  echo "CUDA_VERSION_MAJOR=\${CUDA_VERSION%.*}"
+  echo "CUDA_VERSION_MINOR=\${CUDA_VERSION#*.}"
+  echo "if (( CUDA_VERSION_MAJOR > 11 || (CUDA_VERSION_MAJOR == 11 && CUDA_VERSION_MINOR >= 7) )); then"
+  echo " nvcc \"\$@\""
+  echo " exit"
+  echo "fi"
+  printf '\n%s\n' "BUILDDIR=\$(mktemp -d /tmp/nvcc-lazy-build.XXXXXXXX)"
+  echo "echo \"\$@\" > \${BUILDDIR}/args"
+  echo "BUILDSH=\${BUILDDIR}/build.sh"
+  echo "$2/bin/nvcc --dryrun --keep --keep-dir=\${BUILDDIR} \"\$@\" 2>&1 | sed -e 's/#\\$ //;/^rm/d' > \$BUILDSH"
+  echo "sed -i -e '/^\s*--/d' \$BUILDSH"
+  echo "sed -ne '1,/^cicc.*cudafe1.stub.c/p' \${BUILDSH} > \${BUILDSH}.pre"
+  echo "sed -e '1,/^cicc.*cudafe1.stub.c/d' \${BUILDSH} > \${BUILDSH}.post"
+  printf '\n%s\n\n' "sed -i -e '/LIBRARIES=/{s/\s//g;s/\"\"/ /g}' \${BUILDSH}.pre"
+  echo "/usr/bin/env bash \${BUILDSH}.pre"
+  echo "STUBF=\$(find \$BUILDDIR -name *.cudafe1.stub.c)"
+  echo "CUFILE=\$(basename -s '.cudafe1.stub.c' \$STUBF)"
+  echo "sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' \$STUBF"
+  echo "sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' \$STUBF"
+  echo "# sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\\\"===> \${CUFILE} lazy-load? %d\\\\\\\\n\\\", l); __do____cudaRegisterAll();}\" \$STUBF"
+  echo "sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}\" \$STUBF"
+  echo "sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' \$STUBF"
+  echo "sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' \$STUBF"
+  echo "/usr/bin/env bash \${BUILDSH}.post"
+  echo "rm -rf \$BUILDDIR"
+} > "$1"
-- 
GitLab