New whl release strategy with pruned nv_fatbin (#35239)

[Background] Expansion in code size can be irreversible in the long run, leading to huge release packages, which not only hamper user experience but also exceed a hard limit of PyPI. In particular, the NV_FATBIN section takes up 86% of the compiled dylib size, owing to the vast number of GPU arches supported. This PR aims to prune NV_FATBIN. [Solution] The new release strategy involves two types of whl packages. (1) Cubin PIP package: the PIP package maintains a smaller window of GPU arch support, containing sm_60, sm_70, sm_75 and sm_80 cubins, covering the Pascal through Ampere arches. (2) JIT release package: a backup for the Cubin PIP package, containing compute_35, compute_50, compute_60, compute_70, compute_75 and compute_80, with the best performance and GPU arch coverage. However, it takes around 10 minutes to install due to JIT compilation. [How to use] The new release strategy is disabled by default. To compile the Cubin PIP package, add this to cmake: -DCUBIN_RELEASE_PIP. To compile the JIT release package, add this to cmake: -DJIT_RELEASE_WHL. (Note: the options actually added to CMakeLists.txt by this change are named NEW_RELEASE_CUBIN and NEW_RELEASE_JIT.)

New whl release strategy with pruned nv_fatbin (#35239)
[Background] Expansion in code size can be irreversible in the long run, leading to huge release packages, which not only hamper user experience but also exceed a hard limit of PyPI. In particular, the NV_FATBIN section takes up 86% of the compiled dylib size, owing to the vast number of GPU arches supported. This PR aims to prune NV_FATBIN. [Solution] The new release strategy involves two types of whl packages. (1) Cubin PIP package: the PIP package maintains a smaller window of GPU arch support, containing sm_60, sm_70, sm_75 and sm_80 cubins, covering the Pascal through Ampere arches. (2) JIT release package: a backup for the Cubin PIP package, containing compute_35, compute_50, compute_60, compute_70, compute_75 and compute_80, with the best performance and GPU arch coverage. However, it takes around 10 minutes to install due to JIT compilation. [How to use] The new release strategy is disabled by default. To compile the Cubin PIP package, add this to cmake: -DCUBIN_RELEASE_PIP. To compile the JIT release package, add this to cmake: -DJIT_RELEASE_WHL. (Note: the options actually added to CMakeLists.txt by this change are named NEW_RELEASE_CUBIN and NEW_RELEASE_JIT.)
2f3b393d · Zhanlue Yang · GitHub · d9f59fd1 · 2f3b393d · 2f3b393d
Showing 4 changed files with 38 additions and 5 deletions

CMakeLists.txt CMakeLists.txt +2 -0

cmake/cuda.cmake cmake/cuda.cmake +19 -1

python/paddle/__init__.py python/paddle/__init__.py +1 -1

python/setup.py.in python/setup.py.in +16 -3

未找到文件。
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -222,6 +222,8 @@ option(WITH_MIPS   "Compile PaddlePaddle with mips support"         OFF)
 option(WITH_MUSL        "Compile with musl libc instead of gblic"  OFF)
 option(WITH_UNITY_BUILD "Compile with UnityBuild mode"             OFF)
 option(WITH_STRIP       "Strip so files of Whl packages"         OFF)
+option(NEW_RELEASE_CUBIN   "PaddlePaddle next-level release strategy for pypi cubin package"             OFF)
+option(NEW_RELEASE_JIT   "PaddlePaddle next-level release strategy for backup jit package"             OFF)

 # PY_VERSION
 if(NOT PY_VERSION)

--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -3,10 +3,22 @@ if(NOT WITH_GPU)
 endif()


-if (WITH_NV_JETSON)
+if(WITH_NV_JETSON)
  add_definitions(-DWITH_NV_JETSON)
  set(paddle_known_gpu_archs "53 62 72")
  set(paddle_known_gpu_archs10 "53 62 72")
+elseif(NEW_RELEASE_CUBIN)
+  message("Using New Release Strategy - Cubin Packge")
+  add_definitions(-DNEW_RELEASE_CUBIN)
+  set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86")
+  set(paddle_known_gpu_archs10 "50 60 70 75")
+  set(paddle_known_gpu_archs11 "60 70 75 80")
+elseif(NEW_RELEASE_JIT)
+  message("Using New Release Strategy - JIT Packge")
+  add_definitions(-DNEW_RELEASE_JIT)
+  set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86")
+  set(paddle_known_gpu_archs10 "35 50 60 70 75")
+  set(paddle_known_gpu_archs11 "35 50 60 70 75 80")
 else()
  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80")
  set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
@@ -130,11 +142,17 @@ function(select_nvcc_arch_flags out_variable)
    set(cuda_arch_bin ${CUDA_ARCH_BIN})
  endif()

+  if(NEW_RELEASE_JIT)
+      set(cuda_arch_ptx "${cuda_arch_ptx}${cuda_arch_bin}")
+      set(cuda_arch_bin "")
+  endif()
+
  # remove dots and convert to lists
  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
+
  list(REMOVE_DUPLICATES cuda_arch_bin)
  list(REMOVE_DUPLICATES cuda_arch_ptx)


--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -14,7 +14,7 @@
 try:
    from paddle.version import full_version as __version__
    from paddle.version import commit as __git_commit__
-
+    from paddle.cuda_env import *
 except ImportError:
    import sys
    sys.stderr.write('''Warning with import paddle: you should not

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -106,6 +106,20 @@ def mkl():

 write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')

+def write_cuda_env_config_py(filename='paddle/cuda_env.py'):
+    """Write `filename`: sets CUDA_CACHE_MAXSIZE (768 MiB) for JIT release builds (NEW_RELEASE_JIT, or legacy JIT_RELEASE_WHL), else empty."""
+    cnt = ""
+    if 'ON' in ('${NEW_RELEASE_JIT}', '${JIT_RELEASE_WHL}'):
+        cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
+#
+import os
+os.environ['CUDA_CACHE_MAXSIZE'] = '805306368'
+'''
+    with open(filename, 'w') as f:
+        f.write(cnt)
+
+write_cuda_env_config_py(filename='@PADDLE_BINARY_DIR@/python/paddle/cuda_env.py')
+
 def write_distributed_training_mode_py(filename='paddle/fluid/incubate/fleet/parameter_server/version.py'):
    cnt = '''from __future__ import print_function

@@ -414,10 +428,9 @@ class InstallCommand(InstallCommandBase):
    def finalize_options(self):
        ret = InstallCommandBase.finalize_options(self)
        self.install_lib = self.install_platlib
-        self.install_headers = os.path.join(self.install_platlib, 'paddle',
-                                            'include')
-        return ret
+        self.install_headers = os.path.join(self.install_platlib, 'paddle', 'include')
        
+        return ret

 class InstallHeaders(Command):
    """Override how headers are copied.