From 2f3b393d151afd6e82cb88095cb2079a9be65f61 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 31 Aug 2021 18:09:18 +0800 Subject: [PATCH] New whl release strategy with pruned nv_fatbin (#35239) [Background] Expansion in code size can be irreversible in the long run, leading to huge release packages which not only hamper user experience but also exceed a hard limit of pypi. In particular, the NV_FATBIN section takes up 86% of the compiled dylib size, owing to the vast number of GPU arches supported. This PR aims to prune this NV_FATBIN. [Solution] In the new release strategy, two types of whl packages will be involved: Cubin PIP package: PIP package maintains a smaller window for GPU arches support, containing sm_60, sm_70, sm_75, sm_80 cubins, covering Pascal - Ampere arches. JIT release package: This is a backup for the Cubin PIP package, containing compute_35, compute_50, compute_60, compute_70, compute_75, compute_80, with best performance and GPU arches coverage. However, it takes around 10 min to install due to the JIT compilation. [How to use] The new release strategy is disabled by default. 
To compile for Cubin PIP package, add this to cmake: -DCUBIN_RELEASE_PIP To compile for JIT release package, add this to cmake: -DJIT_RELEASE_WHL --- CMakeLists.txt | 2 ++ cmake/cuda.cmake | 20 +++++++++++++++++++- python/paddle/__init__.py | 2 +- python/setup.py.in | 19 ++++++++++++++++--- 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 83191254f1..0f25e7d9dc 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -222,6 +222,8 @@ option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) option(WITH_STRIP "Strip so files of Whl packages" OFF) +option(NEW_RELEASE_CUBIN "PaddlePaddle next-level release strategy for pypi cubin package" OFF) +option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF) # PY_VERSION if(NOT PY_VERSION) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index e1a9324650..f9c896acd8 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -3,10 +3,22 @@ if(NOT WITH_GPU) endif() -if (WITH_NV_JETSON) +if(WITH_NV_JETSON) add_definitions(-DWITH_NV_JETSON) set(paddle_known_gpu_archs "53 62 72") set(paddle_known_gpu_archs10 "53 62 72") +elseif(NEW_RELEASE_CUBIN) + message("Using New Release Strategy - Cubin Packge") + add_definitions(-DNEW_RELEASE_CUBIN) + set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86") + set(paddle_known_gpu_archs10 "50 60 70 75") + set(paddle_known_gpu_archs11 "60 70 75 80") +elseif(NEW_RELEASE_JIT) + message("Using New Release Strategy - JIT Packge") + add_definitions(-DNEW_RELEASE_JIT) + set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86") + set(paddle_known_gpu_archs10 "35 50 60 70 75") + set(paddle_known_gpu_archs11 "35 50 60 70 75 80") else() set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80") set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75") @@ -130,11 +142,17 @@ 
function(select_nvcc_arch_flags out_variable) set(cuda_arch_bin ${CUDA_ARCH_BIN}) endif() + if(NEW_RELEASE_JIT) + set(cuda_arch_ptx "${cuda_arch_ptx}${cuda_arch_bin}") + set(cuda_arch_bin "") + endif() + # remove dots and convert to lists string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}") string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") + list(REMOVE_DUPLICATES cuda_arch_bin) list(REMOVE_DUPLICATES cuda_arch_ptx) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 1c38d51979..ce338275b2 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -14,7 +14,7 @@ try: from paddle.version import full_version as __version__ from paddle.version import commit as __git_commit__ - + from paddle.cuda_env import * except ImportError: import sys sys.stderr.write('''Warning with import paddle: you should not diff --git a/python/setup.py.in b/python/setup.py.in index 4990544926..8f9f973d93 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -106,6 +106,20 @@ def mkl(): write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py') +def write_cuda_env_config_py(filename='paddle/cuda_env.py'): + cnt = "" + if '${JIT_RELEASE_WHL}' == 'ON': + cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY +# +import os +os.environ['CUDA_CACHE_MAXSIZE'] = '805306368' +''' + + with open(filename, 'w') as f: + f.write(cnt) + +write_cuda_env_config_py(filename='@PADDLE_BINARY_DIR@/python/paddle/cuda_env.py') + def write_distributed_training_mode_py(filename='paddle/fluid/incubate/fleet/parameter_server/version.py'): cnt = '''from __future__ import print_function @@ -414,11 +428,10 @@ class InstallCommand(InstallCommandBase): def finalize_options(self): ret = InstallCommandBase.finalize_options(self) self.install_lib = self.install_platlib - self.install_headers = 
os.path.join(self.install_platlib, 'paddle', - 'include') + self.install_headers = os.path.join(self.install_platlib, 'paddle', 'include') + return ret - class InstallHeaders(Command): """Override how headers are copied. """ -- GitLab