From 210fa77770193d99e016bab967a209c0bc57a424 Mon Sep 17 00:00:00 2001
From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com>
Date: Mon, 8 Aug 2022 10:24:31 +0800
Subject: [PATCH] nvcclazylinux (#44957)

---
 CMakeLists.txt                                | 696 ++++++++++--------
 cmake/experimental.cmake                      |  17 +
 .../cuda_module_loading_lazy.cmake            |  55 ++
 tools/nvcc_lazy.sh                            |  70 ++
 4 files changed, 537 insertions(+), 301 deletions(-)
 create mode 100644 cmake/experimental.cmake
 create mode 100644 cmake/experiments/cuda_module_loading_lazy.cmake
 create mode 100644 tools/nvcc_lazy.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b830484127..c4286292b01 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,12 +13,12 @@
 # limitations under the License
 
 if(APPLE AND WITH_ARM)
-    # cmake 3.19.2 version starts to support M1
-    cmake_minimum_required(VERSION 3.19.2)
-    cmake_policy(VERSION 3.19.2)
+  # cmake 3.19.2 version starts to support M1
+  cmake_minimum_required(VERSION 3.19.2)
+  cmake_policy(VERSION 3.19.2)
 else(APPLE AND WITH_ARM)
-    cmake_minimum_required(VERSION 3.15)
-    cmake_policy(VERSION 3.10)
+  cmake_minimum_required(VERSION 3.15)
+  cmake_policy(VERSION 3.10)
 endif(APPLE AND WITH_ARM)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
@@ -28,9 +28,12 @@ include(system)
 
 # Note(zhouwei): Ninja Generator will set CMAKE_BUILD_TYPE to Debug
 if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Release" CACHE STRING
-      "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
-      FORCE)
+  set(CMAKE_BUILD_TYPE
+      "Release"
+      CACHE
+        STRING
+        "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
+        FORCE)
 endif()
 
 project(paddle CXX C)
@@ -39,152 +42,181 @@ project(paddle CXX C)
 # TODO(Shibo Tao): remove find_package(CUDA) completely.
 find_package(CUDA QUIET)
 find_package(MKL CONFIG QUIET)
-option(WITH_ONEMKL      "Compile PaddlePaddle with oneMKL"              OFF)
-option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
-option(WITH_TENSORRT    "Compile PaddlePaddle with NVIDIA TensorRT"     OFF)
-option(WITH_XPU         "Compile PaddlePaddle with BAIDU KUNLUN XPU"    OFF)
-option(WITH_XPU_KP      "Compile PaddlePaddle with BAIDU XPU compiler " OFF)
-option(WITH_MLU    "Compile PaddlePaddle with CAMBRICON MLU"     OFF)
-option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode"    OFF)
-option(WITH_ASCEND         "Compile PaddlePaddle with ASCEND"        OFF)
-option(WITH_ROCM        "Compile PaddlePaddle with ROCM platform"       OFF)
-option(WITH_IPU         "Compile PaddlePaddle with Graphcore IPU"    OFF)
+option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF)
+option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
+option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
+option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
+option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF)
+option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)
+option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
+option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
+option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
+option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
 # NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
 # to develop some acl related functionality on x86
-option(WITH_ASCEND_CL         "Compile PaddlePaddle with ASCEND CL"        ${WITH_ASCEND})
-option(WITH_ASCEND_CXX11         "Compile PaddlePaddle with ASCEND and CXX11 ABI"        OFF)
-option(WITH_ONNXRUNTIME         "Compile PaddlePaddle with ONNXRUNTIME"          OFF)
+option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
+option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
+option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF)
 # Note(zhouwei): It use option above, so put here
 include(init)
-include(generic)            # simplify cmake module
+include(generic) # simplify cmake module
+include(experimental) # experimental build options
 
-if (WITH_GPU  AND WITH_XPU)
-    message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
+if(WITH_GPU AND WITH_XPU)
+  message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
 endif()
-if (WITH_GPU AND WITH_XPU_KP)
-    message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time")
+if(WITH_GPU AND WITH_XPU_KP)
+  message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time")
 endif()
-if (WITH_GPU AND WITH_ASCEND)
-    message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
+if(WITH_GPU AND WITH_ASCEND)
+  message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
 endif()
-if (WITH_GPU AND WITH_ROCM)
-    message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time")
+if(WITH_GPU AND WITH_ROCM)
+  message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time")
 endif()
-if (WITH_GPU AND WITH_MLU)
-    message(FATAL_ERROR "Error when compile GPU and MLU at the same time")
+if(WITH_GPU AND WITH_MLU)
+  message(FATAL_ERROR "Error when compile GPU and MLU at the same time")
 endif()
 
 if(WITH_GPU AND NOT APPLE)
-    enable_language(CUDA)
-    message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: "
-        "${CMAKE_CUDA_COMPILER_ID} ${CMAKE_CUDA_COMPILER_VERSION}")
+  enable_language(CUDA)
+  message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: "
+                 "${CMAKE_CUDA_COMPILER_ID} ${CMAKE_CUDA_COMPILER_VERSION}")
 endif()
 
 message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
-        "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
+               "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
-        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+               "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 message(STATUS "AR tools: ${CMAKE_AR}")
 
 # MUSL build turn off warnings
 if(WITH_MUSL)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
+  set(CMAKE_CXX_FLAGS
+      "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy"
+  )
 endif()
 
 if(APPLE AND WITH_ARM)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
-    set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -target arm64-apple-darwin") # extend the CXX flags, not a copy of the C flags
 endif()
 
 if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
-    set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
 endif()
 
 if(WIN32)
-    option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
-
-    set(CMAKE_SUPPRESS_REGENERATION ON)
-    set(CMAKE_STATIC_LIBRARY_PREFIX lib)
-
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj")
-
-    if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
-        set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /Zc:inline")
-        set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /Zc:inline")
-        set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline")
-        set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline")
+  option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
+
+  set(CMAKE_SUPPRESS_REGENERATION ON)
+  set(CMAKE_STATIC_LIBRARY_PREFIX lib)
+
+  set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj")
+  set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj")
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj")
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj")
+
+  if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
+    set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline")
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline")
+  endif()
+
+  if(MSVC_STATIC_CRT)
+    message(
+      STATUS
+        "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019"
+    )
+    foreach(
+      flag_var
+      CMAKE_CXX_FLAGS
+      CMAKE_CXX_FLAGS_DEBUG
+      CMAKE_CXX_FLAGS_RELEASE
+      CMAKE_CXX_FLAGS_MINSIZEREL
+      CMAKE_CXX_FLAGS_RELWITHDEBINFO
+      CMAKE_C_FLAGS
+      CMAKE_C_FLAGS_DEBUG
+      CMAKE_C_FLAGS_RELEASE
+      CMAKE_C_FLAGS_MINSIZEREL
+      CMAKE_C_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/MD")
+        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+      endif()
+    endforeach(flag_var)
+  endif()
+
+  # NOTE(zhouwei): msvc max/min macro conflict with std::min/max, define NOMINMAX globally
+  add_definitions("-DNOMINMAX")
+  # windows build turn off warnings, use parallel compiling.
+  foreach(
+    flag_var
+    CMAKE_CXX_FLAGS
+    CMAKE_CXX_FLAGS_DEBUG
+    CMAKE_CXX_FLAGS_RELEASE
+    CMAKE_CXX_FLAGS_MINSIZEREL
+    CMAKE_CXX_FLAGS_RELWITHDEBINFO
+    CMAKE_C_FLAGS
+    CMAKE_C_FLAGS_DEBUG
+    CMAKE_C_FLAGS_RELEASE
+    CMAKE_C_FLAGS_MINSIZEREL
+    CMAKE_C_FLAGS_RELWITHDEBINFO)
+    string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
+
+    # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling,
+    # For Visual Studio generators, /MP should be added.
+    # For other generators like Ninja, it is not need to add /MP.
+    if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT WITH_GPU)
+      math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
+      set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
     endif()
-
-    if (MSVC_STATIC_CRT)
-        message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
-        foreach(flag_var
-            CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-            CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-            CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-            CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-            if(${flag_var} MATCHES "/MD")
-                string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-            endif()
-        endforeach(flag_var)
+  endforeach(flag_var)
+  foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+    set(${flag_var} "${${flag_var}} /w")
+  endforeach(flag_var)
+
+  # Windows Remove /Zi, /ZI for Release, MinSizeRel builds
+  foreach(flag_var
+          CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL
+          CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL)
+    if(${flag_var} MATCHES "/Z[iI]")
+      string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}")
     endif()
+  endforeach(flag_var)
+
+  set(CMAKE_C_FLAGS
+      "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838"
+  )
+  set(CMAKE_CXX_FLAGS
+      "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838"
+  )
+
+  foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS
+                   CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS)
+    set(${flag_var}
+        "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221")
+    if(MSVC_STATIC_CRT)
+      set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB")
+    endif()
+  endforeach(flag_var)
 
-    # NOTE(zhouwei): msvc max/min macro conflict with std::min/max, define NOMINMAX globally
-    add_definitions("-DNOMINMAX")
-    # windows build turn off warnings, use parallel compiling.
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-        string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-        
-        # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling,
-        # For Visual Studio generators, /MP should be added. 
-        # For other generators like Ninja, it is not need to add /MP.
-        if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT WITH_GPU)
-            math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
-            set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
-        endif()
-    endforeach(flag_var)
-    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
-        set(${flag_var} "${${flag_var}} /w")
-    endforeach(flag_var)
-
-    # Windows Remove /Zi, /ZI for Release, MinSizeRel builds
-    foreach(flag_var
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL)
-        if(${flag_var} MATCHES "/Z[iI]")
-            string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}")
-        endif()
-    endforeach(flag_var)
-
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
+  if(WITH_WIN_DUMP_DBG)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi")
 
-    foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS)
-        set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221")
-        if(MSVC_STATIC_CRT)
-            set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB")
-        endif()
+    foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS
+                     CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS)
+      set(${flag_var} "${${flag_var}} /DEBUG /OPT:REF /OPT:ICF")
     endforeach(flag_var)
 
-    if (WITH_WIN_DUMP_DBG)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi")
-
-        foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS)
-            set(${flag_var} "${${flag_var}} /DEBUG /OPT:REF /OPT:ICF")
-        endforeach(flag_var)
-
-        add_definitions("-DWITH_WIN_DUMP_DBG")
-    endif()
+    add_definitions("-DWITH_WIN_DUMP_DBG")
+  endif()
 
 else(WIN32)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations")
+  set(CMAKE_CXX_FLAGS
+      "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations"
+  )
 endif(WIN32)
 
 find_package(Git REQUIRED)
@@ -192,7 +224,7 @@ find_package(Git REQUIRED)
 # config GIT_URL with github mirrors to speed up dependent repos clone
 option(GIT_URL "Git URL to clone dependent repos" ${GIT_URL})
 if(NOT GIT_URL)
-    set(GIT_URL "https://github.com")
+  set(GIT_URL "https://github.com")
 endif()
 
 find_package(Threads REQUIRED)
@@ -200,58 +232,75 @@ find_package(Threads REQUIRED)
 include(simd)
 
 ################################ Exposed Configurations #######################################
-option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
-option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
-option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
-option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
-option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
-option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
-option(ON_INFER         "Turn on inference optimization and inference-lib generation" OFF)
+option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
+option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
+option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
+option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
+option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
+option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
+option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
+option(ON_INFER "Turn on inference optimization and inference-lib generation"
+       OFF)
 ################################ Internal Configurations #######################################
-option(WITH_NV_JETSON   "Compile PaddlePaddle with NV JETSON"             OFF)
-option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler and gperftools"        OFF)
-option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
-option(WITH_INCREMENTAL_COVERAGE    "Generate coverage reports only for incremental code"       OFF)
-OPTION(WITH_LIBXSMM     "Compile with libxsmm"                          OFF)
-option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
-option(WITH_PSLIB       "Compile with pslib support"                    OFF)
-option(WITH_BOX_PS      "Compile with box_ps support"                   OFF)
-option(WITH_XBYAK       "Compile with xbyak support"                    ON)
-option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
-option(WITH_PSCORE     "Compile with parameter server support"         ${WITH_DISTRIBUTE})
-option(WITH_HETERPS     "Compile with heterps"                          OFF})
-option(WITH_INFERENCE_API_TEST   "Test fluid inference C++ high-level api interface"  OFF)
-option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
-option(WITH_DGC   "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
-option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
-option(WITH_LITE   "Compile Paddle Fluid with Lite Engine" OFF)
-option(WITH_CINN   "Compile PaddlePaddle with CINN" OFF)
-option(WITH_INFRT  "Compile PaddlePaddle with INFRT" OFF)
-option(WITH_NCCL   "Compile PaddlePaddle with NCCL support"             ON)
-option(WITH_RCCL   "Compile PaddlePaddle with RCCL support"             ON)
-option(WITH_XPU_BKCL    "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL"   OFF)
-option(WITH_CNCL   "Compile PaddlePaddle with CNCL support"             OFF)
-option(WITH_CRYPTO   "Compile PaddlePaddle with crypto support"         ON)
-option(WITH_ARM   "Compile PaddlePaddle with arm support"         OFF)
-option(WITH_SW   "Compile PaddlePaddle with sw support"         OFF)
-option(WITH_MIPS   "Compile PaddlePaddle with mips support"         OFF)
-option(WITH_MUSL        "Compile with musl libc instead of gblic"  OFF)
-option(WITH_UNITY_BUILD "Compile with UnityBuild mode"             OFF)
-option(WITH_STRIP       "Strip so files of Whl packages"         OFF)
-option(NEW_RELEASE_PYPI   "PaddlePaddle next-level release strategy for pypi cubin package"             OFF)
-option(NEW_RELEASE_ALL   "PaddlePaddle next-level release strategy for all arches cubin package"             OFF)
-option(NEW_RELEASE_JIT   "PaddlePaddle next-level release strategy for backup jit package"             OFF)
-option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU"    OFF)
-option(WITH_POCKETFFT    "Compile with pocketfft support"      ON)
-option(WITH_RECORD_BUILDTIME    "Compile PaddlePaddle with record all targets build time"       OFF)
-option(WITH_CUSTOM_DEVICE "Compile with custom device support"    OFF)
+option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF)
+option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools"
+       OFF)
+option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
+option(WITH_INCREMENTAL_COVERAGE
+       "Generate coverage reports only for incremental code" OFF)
+option(WITH_LIBXSMM "Compile with libxsmm" OFF)
+option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
+option(WITH_PSLIB "Compile with pslib support" OFF)
+option(WITH_BOX_PS "Compile with box_ps support" OFF)
+option(WITH_XBYAK "Compile with xbyak support" ON)
+option(WITH_CONTRIB "Compile the third-party contributation" OFF)
+option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE})
+option(WITH_HETERPS "Compile with heterps" OFF)
+option(WITH_INFERENCE_API_TEST
+       "Test fluid inference C++ high-level api interface" OFF)
+option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
+option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
+option(
+  SANITIZER_TYPE
+  "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined"
+  OFF)
+option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
+option(WITH_CINN "Compile PaddlePaddle with CINN" OFF)
+option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF)
+option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
+option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
+option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
+option(WITH_CNCL "Compile PaddlePaddle with CNCL support" OFF)
+option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
+option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
+option(WITH_SW "Compile PaddlePaddle with sw support" OFF)
+option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF)
+option(WITH_MUSL "Compile with musl libc instead of gblic" OFF)
+option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF)
+option(WITH_STRIP "Strip so files of Whl packages" OFF)
+option(NEW_RELEASE_PYPI
+       "PaddlePaddle next-level release strategy for pypi cubin package" OFF)
+option(NEW_RELEASE_ALL
+       "PaddlePaddle next-level release strategy for all arches cubin package"
+       OFF)
+option(NEW_RELEASE_JIT
+       "PaddlePaddle next-level release strategy for backup jit package" OFF)
+option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF)
+option(WITH_POCKETFFT "Compile with pocketfft support" ON)
+option(WITH_RECORD_BUILDTIME
+       "Compile PaddlePaddle with record all targets build time" OFF)
+option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF)
 
 if(WITH_RECORD_BUILDTIME)
-    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh")
-    set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh")
-else()            
-    include(ccache) # set ccache for compilation ; if WITH_RECORD_BUILDTIME=ON can't use ccache
+  set_property(
+    GLOBAL PROPERTY RULE_LAUNCH_COMPILE
+                    "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh")
+  set_property(
+    GLOBAL PROPERTY RULE_LAUNCH_LINK
+                    "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh")
+else()
+  # Use ccache for compilation; it can't be used when WITH_RECORD_BUILDTIME=ON.
+  include(ccache)
 endif()
 unset(WITH_RECORD_BUILDTIME CACHE)
 
@@ -261,186 +310,224 @@ if(NOT PY_VERSION)
 endif()
 set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 
-
 # the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined. Default: OFF
-if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$")
+if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES
+                      "^(Address|Leak|Memory|Thread|Undefined)$")
   message("Choose the correct type of sanitizer")
   return()
 endif()
 
-if (LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER)
-set(WITH_CUSTOM_DEVICE ON)
+if(LINUX
+   AND NOT WITH_CUSTOM_DEVICE
+   AND NOT ON_INFER)
+  set(WITH_CUSTOM_DEVICE ON)
 endif()
 
 if(WIN32)
-    if(WITH_DISTRIBUTE)
-        MESSAGE(WARNING
-            "Disable DISTRIBUTE when compiling for Windows. Force WITH_DISTRIBUTE=OFF.")
-        set(WITH_DISTRIBUTE OFF CACHE STRING
-            "Disable DISTRIBUTE when compiling for Windows" FORCE)
-    endif()
-    if(WITH_NCCL)
-        MESSAGE(WARNING
-            "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.")
-        set(WITH_NCCL OFF CACHE STRING
-            "Disable NCCL when compiling for Windows" FORCE)
-    endif()
-endif()
-
-if (NOT WITH_GPU AND WITH_NCCL)
-    MESSAGE(WARNING
-        "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.")
-    set(WITH_NCCL OFF CACHE STRING
-        "Disable NCCL when compiling without GPU" FORCE)
+  if(WITH_DISTRIBUTE)
+    message(
+      WARNING
+        "Disable DISTRIBUTE when compiling for Windows. Force WITH_DISTRIBUTE=OFF."
+    )
+    set(WITH_DISTRIBUTE
+        OFF
+        CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE)
+  endif()
+  if(WITH_NCCL)
+    message(
+      WARNING "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.")
+    set(WITH_NCCL
+        OFF
+        CACHE STRING "Disable NCCL when compiling for Windows" FORCE)
+  endif()
+endif()
+
+if(NOT WITH_GPU AND WITH_NCCL)
+  message(
+    WARNING "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.")
+  set(WITH_NCCL
+      OFF
+      CACHE STRING "Disable NCCL when compiling without GPU" FORCE)
 endif()
 
 # force WITH_XPU on when WITH_XPU_KP
-if (WITH_XPU_KP AND NOT WITH_XPU)
-    MESSAGE(WARNING
-        "Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.")
-    set(WITH_XPU ON CACHE STRING
-        "Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE)
+if(WITH_XPU_KP AND NOT WITH_XPU)
+  message(
+    WARNING
+      "Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.")
+  set(WITH_XPU
+      ON
+      CACHE STRING "Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE)
 endif()
 
-if (NOT WITH_XPU AND WITH_XPU_BKCL)
-    MESSAGE(WARNING
-        "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
-    set(WITH_XPU_BKCL OFF CACHE STRING
-        "Disable BKCL when compiling without XPU" FORCE)
+if(NOT WITH_XPU AND WITH_XPU_BKCL)
+  message(
+    WARNING "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
+  set(WITH_XPU_BKCL
+      OFF
+      CACHE STRING "Disable BKCL when compiling without XPU" FORCE)
 endif()
 
-if (NOT WITH_MLU AND WITH_CNCL)
-    MESSAGE(WARNING
-        "Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.")
-    set(WITH_MLU OFF CACHE STRING
-        "Disable CNCL when compiling without MLU" FORCE)
+if(NOT WITH_MLU AND WITH_CNCL)
+  # CNCL is only usable with MLU, so the dependent feature is switched off.
+  message(
+    WARNING "Disable CNCL when compiling without MLU. Force WITH_CNCL=OFF.")
+  set(WITH_CNCL OFF CACHE STRING "Disable CNCL when compiling without MLU"
+                          FORCE)
 endif()
 
 if(WITH_NCCL)
-     add_definitions("-DPADDLE_WITH_NCCL")
-     include(nccl)
+  add_definitions("-DPADDLE_WITH_NCCL")
+  include(nccl)
 else()
-     if(WITH_GPU)
-         MESSAGE(WARNING "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used.")
-     endif()
+  if(WITH_GPU)
+    message(
+      WARNING
+        "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used."
+    )
+  endif()
 endif()
 
 if(WITH_BRPC_RDMA)
-    message(STATUS "Use brpc with rdma.")
-    if(NOT WITH_DISTRIBUTE)
-        message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
-    endif()
+  message(STATUS "Use brpc with rdma.")
+  if(NOT WITH_DISTRIBUTE)
+    message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
+  endif()
 endif()
 
-
 if(WITH_GPU)
-    include(cuda)
-    # lite subgraph compilation depends on CUDNN_ROOT,
-    # so include(cudnn) needs to be in front of include(third_party/lite)
-    include(cudnn)              # set cudnn libraries, must before configure
-    include(tensorrt)
-    # there is no official support of nccl, cupti in windows
-    if(NOT WIN32)
-        include(cupti)
-    endif()
+  include(cuda)
+  # lite subgraph compilation depends on CUDNN_ROOT,
+  # so include(cudnn) needs to be in front of include(third_party/lite)
+  include(cudnn) # set cudnn libraries, must before configure
+  include(tensorrt)
+  # there is no official support of nccl, cupti in windows
+  if(NOT WIN32)
+    include(cupti)
+  endif()
 endif()
 
 if(WITH_MLU)
-    include(neuware)
+  include(neuware)
 endif()
 
 if(WITH_ROCM)
-    include(hip)
-    include(miopen) # set miopen libraries, must before configure
+  include(hip)
+  include(miopen) # set miopen libraries, must before configure
 endif(WITH_ROCM)
 
 if(WITH_XPU_KP)
-    include(xpu_kp)
+  include(xpu_kp)
 endif()
 
-if (NOT WITH_ROCM AND WITH_RCCL)
-    MESSAGE(WARNING
-        "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
-    set(WITH_RCCL OFF CACHE STRING
-        "Disable RCCL when compiling without ROCM" FORCE)
+if(NOT WITH_ROCM AND WITH_RCCL)
+  message(
+    WARNING "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
+  set(WITH_RCCL
+      OFF
+      CACHE STRING "Disable RCCL when compiling without ROCM" FORCE)
 endif()
 
 if(WITH_RCCL)
-     add_definitions("-DPADDLE_WITH_RCCL")
-     include(rccl)
+  add_definitions("-DPADDLE_WITH_RCCL")
+  include(rccl)
 else()
-     if(WITH_ROCM)
-         MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.")
-     endif()
+  if(WITH_ROCM)
+    message(
+      WARNING
+        "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used."
+    )
+  endif()
 endif()
 
 if(WITH_HETERPS AND WITH_PSLIB)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
 endif()
 
 if(WITH_DISTRIBUTE)
-    if(LINUX)
-        set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
-    endif()
-    if(WITH_ASCEND_CL)
-        # disable WITH_PSCORE for NPU before include third_party
-        MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.")
-        set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE)
-    endif()
-endif()
-
-include(third_party)  # download, build, install third_party, Contains about 20+ dependencies
-
-include(flags)              # set paddle compile flags
+  if(LINUX)
+    set(WITH_GLOO
+        ON
+        CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
+  endif()
+  if(WITH_ASCEND_CL)
+    # disable WITH_PSCORE for NPU before include third_party
+    message(
+      WARNING
+        "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.")
+    set(WITH_PSCORE
+        OFF
+        CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE)
+  endif()
+endif()
+
+# Download, build and install third_party; contains about 20+ dependencies.
+include(third_party)
+
+include(flags) # set paddle compile flags
 
 if(WITH_PROFILER)
-    find_package(Gperftools REQUIRED)
-    include_directories(${GPERFTOOLS_INCLUDE_DIR})
-    add_definitions(-DWITH_GPERFTOOLS)
+  find_package(Gperftools REQUIRED)
+  include_directories(${GPERFTOOLS_INCLUDE_DIR})
+  add_definitions(-DWITH_GPERFTOOLS)
 endif()
 
-include(util)               # set unittest and link libs
-include(version)            # set PADDLE_VERSION
-include(coveralls)          # set code coverage
-include(configure)          # add paddle env configuration
+include(util) # set unittest and link libs
+include(version) # set PADDLE_VERSION
+include(coveralls) # set code coverage
+include(configure) # add paddle env configuration
 
 include_directories("${PADDLE_SOURCE_DIR}")
 
 if(WITH_NV_JETSON)
-    set(WITH_ARM ON CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE)
+  set(WITH_ARM
+      ON
+      CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE)
 endif()
 
 if(WITH_ARM)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
-    set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON." FORCE)
-    set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE)
-    set(WITH_AVX OFF CACHE STRING "Disable AVX when compiling WITH_AVX=OFF." FORCE)
-    add_definitions(-DPADDLE_WITH_ARM)
-endif()
-
-if (WITH_SW)
-    # mieee flag solves floating-point exceptions under sw and ALPHA architectures
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee")
-    set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE)
-    set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_SW=ON." FORCE)
-    add_definitions(-DPADDLE_WITH_SW)
-endif()
-
-if (WITH_MIPS)
-    set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_MIPS=ON" FORCE)
-    add_definitions(-DPADDLE_WITH_MIPS)
-endif()
-
-if (WITH_ONEMKL)
-    add_definitions(-DPADDLE_WITH_ONEMKL)
-endif()
-
-if (WITH_HETERPS)
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new")
-    endif()
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+  set(WITH_XBYAK
+      OFF
+      CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON." FORCE)
+  set(WITH_MKL
+      OFF
+      CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE)
+  set(WITH_AVX
+      OFF
+      CACHE STRING "Disable AVX when compiling WITH_ARM=ON." FORCE)
+  add_definitions(-DPADDLE_WITH_ARM)
+endif()
+
+if(WITH_SW)
+  # mieee flag solves floating-point exceptions under sw and ALPHA architectures
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee")
+  set(WITH_XBYAK
+      OFF
+      CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE)
+  set(WITH_MKL
+      OFF
+      CACHE STRING "Disable MKL when compiling WITH_SW=ON." FORCE)
+  add_definitions(-DPADDLE_WITH_SW)
+endif()
+
+if(WITH_MIPS)
+  set(WITH_XBYAK
+      OFF
+      CACHE STRING "Disable XBYAK when compiling WITH_MIPS=ON" FORCE)
+  add_definitions(-DPADDLE_WITH_MIPS)
+endif()
+
+if(WITH_ONEMKL)
+  add_definitions(-DPADDLE_WITH_ONEMKL)
+endif()
+
+if(WITH_HETERPS)
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new")
+  endif()
 endif()
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 
@@ -450,25 +537,32 @@ set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 add_definitions(-DPADDLE_DLL_EXPORT)
 
 if(ON_INFER)
-    # you can trun off the paddle fluid and inference lib by set ON_INFER=OFF
-    message(STATUS "On inference mode, will take place some specific optimization.")
-    include(inference_lib)
-    add_definitions(-DPADDLE_ON_INFERENCE)
+  # You can turn off the paddle fluid and inference lib by setting ON_INFER=OFF.
+  message(
+    STATUS "On inference mode, will take place some specific optimization.")
+  include(inference_lib)
+  add_definitions(-DPADDLE_ON_INFERENCE)
 else()
-    #TODO(luotao), combine this warning with `make inference_lib_dist` command.
-    message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
+  # TODO(luotao): combine this warning with the `make inference_lib_dist` command.
+  message(
+    WARNING
+      "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only."
+  )
 endif()
 
 if(WITH_STRIP)
-    find_program(STRIP_PATH strip)
-    if(NOT STRIP_PATH OR NOT LINUX)
-        set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE)
-    endif()
+  # Stripping needs both the strip program and a Linux host; when either is
+  # missing, WITH_STRIP is forced back to OFF in the cache.
+  find_program(STRIP_PATH strip)
+  if(NOT (STRIP_PATH AND LINUX))
+    set(WITH_STRIP OFF CACHE STRING
+        "Command strip is only used on Linux when it exists." FORCE)
+  endif()
 endif()
 
 add_subdirectory(paddle)
 if(WITH_PYTHON)
-    add_subdirectory(python)
+  add_subdirectory(python)
 endif()
 
 get_directory_property(all_inc_dirs INCLUDE_DIRECTORIES)
diff --git a/cmake/experimental.cmake b/cmake/experimental.cmake
new file mode 100644
index 00000000000..df98a86a0a8
--- /dev/null
+++ b/cmake/experimental.cmake
@@ -0,0 +1,17 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# this file contains experimental build options
+
+include(experiments/cuda_module_loading_lazy)
diff --git a/cmake/experiments/cuda_module_loading_lazy.cmake b/cmake/experiments/cuda_module_loading_lazy.cmake
new file mode 100644
index 00000000000..f4ab829b285
--- /dev/null
+++ b/cmake/experiments/cuda_module_loading_lazy.cmake
@@ -0,0 +1,55 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# this file contains experimental build options for lazy cuda module loading
+# cuda module lazy loading is supported by CUDA 11.7+
+# this experiment option makes Paddle support lazy loading before CUDA 11.7.
+
+if(LINUX)
+  # Bail out unless ON_INFER is on, CUDA was found and CUDA is older than 11.7.
+  # Conditions name the variable (or quote it) so an unset value cannot break if().
+  if(NOT ON_INFER)
+    message(
+      "EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms"
+    )
+    return()
+  endif()
+  if(NOT CUDA_FOUND)
+    message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA")
+    return()
+  endif()
+  if("${CUDA_VERSION}" VERSION_GREATER_EQUAL "11.7")
+    message("cuda 11.7+ already support lazy module loading")
+    return()
+  endif()
+
+  message(
+    "for cuda before 11.7, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a"
+  )
+  # Force the shared CUDA runtime, as the message above requires.
+  set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "" FORCE)
+  set(CMAKE_CUDA_FLAGS "--cudart shared")
+  enable_language(CUDA)
+  # Generate the nvcc wrapper. One execute_process() per step: COMMANDs sharing
+  # a call run concurrently as a pipeline, so "rm" could race the generator.
+  set(nvcc_lazy "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy")
+  execute_process(COMMAND "rm" "-rf" "${nvcc_lazy}")
+  execute_process(COMMAND "chmod" "755" "${nvcc_lazy}.sh")
+  execute_process(COMMAND "bash" "${nvcc_lazy}.sh" "${nvcc_lazy}"
+                          "${CUDA_TOOLKIT_ROOT_DIR}")
+  execute_process(COMMAND "chmod" "755" "${nvcc_lazy}")
+  # Use the generated wrapper as the nvcc executable and the CUDA compiler.
+  set(CUDA_NVCC_EXECUTABLE "${nvcc_lazy}" CACHE FILEPATH "" FORCE)
+  set(CMAKE_CUDA_COMPILER "${nvcc_lazy}" CACHE FILEPATH "" FORCE)
+endif()
diff --git a/tools/nvcc_lazy.sh b/tools/nvcc_lazy.sh
new file mode 100644
index 00000000000..efb0223ae6c
--- /dev/null
+++ b/tools/nvcc_lazy.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Usage: nvcc_lazy.sh <wrapper-output-path> <cuda-toolkit-root>
+# The wrapper is appended line by line to $1, so $1 must not exist beforehand.
+echo "#!/usr/bin/env bash" >> "$1"
+echo "unset GREP_OPTIONS" >> "$1"
+echo "set -e" >> "$1"
+echo -e >> "$1"
+echo "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved." >> "$1"
+echo "#" >> "$1"
+echo "# Licensed under the Apache License, Version 2.0 (the \"License\");" >> "$1"
+echo "# you may not use this file except in compliance with the License." >> "$1"
+echo "# You may obtain a copy of the License at" >> "$1"
+echo "#" >> "$1"
+echo "#     http://www.apache.org/licenses/LICENSE-2.0" >> "$1"
+echo "#" >> "$1"
+echo "# Unless required by applicable law or agreed to in writing, software" >> "$1"
+echo "# distributed under the License is distributed on an \"AS IS\" BASIS," >> "$1"
+echo "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." >> "$1"
+echo "# See the License for the specific language governing permissions and" >> "$1"
+echo "# limitations under the License." >> "$1"
+printf '\n\n' >> "$1"
+echo "## CUDA_MODULE_LOADING=EAGER,DEFAULT,LAZY" >> "$1"
+echo -e >> "$1"
+echo "# set cicc PATH for Centos" >> "$1"
+echo "export PATH=\$PATH:$2/nvvm/bin" >> "$1"
+echo -e >> "$1"
+echo "# check nvcc version, if nvcc >= 11.7, just run nvcc itself" >> "$1"
+echo "CUDA_VERSION=\$($2/bin/nvcc --version | grep -oP '(?<=V)\d*\.\d*')" >> "$1"
+echo "CUDA_VERSION_MAJOR=\${CUDA_VERSION%.*}" >> "$1"
+echo "CUDA_VERSION_MINOR=\${CUDA_VERSION#*.}" >> "$1"
+echo "if (( CUDA_VERSION_MAJOR > 11 || (CUDA_VERSION_MAJOR == 11 && CUDA_VERSION_MINOR >= 7) )); then" >> "$1"
+echo "  $2/bin/nvcc \"\$@\"" >> "$1"
+echo "  exit" >> "$1"
+echo "fi" >> "$1"
+echo -e >> "$1"
+echo "BUILDDIR=\$(mktemp -d  /tmp/nvcc-lazy-build.XXXXXXXX)" >> "$1"
+echo "trap 'rm -rf \"\${BUILDDIR}\"' EXIT" >> "$1"
+echo "echo \"\$@\" > \${BUILDDIR}/args" >> "$1"
+echo "BUILDSH=\${BUILDDIR}/build.sh" >> "$1"
+echo "$2/bin/nvcc --dryrun --keep --keep-dir=\${BUILDDIR} \"\$@\" 2>&1 | sed -e 's/#\\$ //;/^rm/d' > \$BUILDSH" >> "$1"
+echo "sed -i -e '/^\s*--/d' \$BUILDSH" >> "$1"
+echo "sed -ne '1,/^cicc.*cudafe1.stub.c/p' \${BUILDSH} > \${BUILDSH}.pre" >> "$1"
+echo "sed -e '1,/^cicc.*cudafe1.stub.c/d' \${BUILDSH} > \${BUILDSH}.post" >> "$1"
+echo -e >> "$1"
+echo "sed -i -e '/LIBRARIES=/{s/\s//g;s/\"\"/ /g}' \${BUILDSH}.pre" >> "$1"
+echo -e >> "$1"
+echo "/usr/bin/env bash \${BUILDSH}.pre" >> "$1"
+echo "STUBF=\$(find \$BUILDDIR -name '*.cudafe1.stub.c')" >> "$1"
+echo "CUFILE=\$(basename -s '.cudafe1.stub.c' \$STUBF)" >> "$1"
+echo "sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' \$STUBF" >> "$1"
+echo "sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' \$STUBF" >> "$1"
+echo "sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}\" \$STUBF" >> "$1"
+echo "sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' \$STUBF" >> "$1"
+echo "sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' \$STUBF" >> "$1"
+echo "/usr/bin/env bash \${BUILDSH}.post" >> "$1"
-- 
GitLab