From 3ba8c48a161d4183e2791b6fb207ae6640780a25 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Fri, 7 May 2021 15:42:47 +0800
Subject: [PATCH] [CHERRY-PICK2.1]Remove paddle_custom_op dynamic libraries,
 and link to FLUID_CORE on windows (#32583) (#32769)

* Remove paddle_custom_op dynamic libraries, change link to FLUID_CORE on
  windows, and check copy_to

* fix CI
---
 paddle/fluid/framework/CMakeLists.txt         |  33 ---
 paddle/scripts/paddle_build.bat               | 112 +++++----
 python/CMakeLists.txt                         |  17 +-
 python/paddle/check_import_scipy.py           |   2 +-
 python/paddle/fluid/core.py                   |  20 +-
 .../fluid/tests/custom_op/CMakeLists.txt      |   5 +-
 .../fluid/tests/custom_op/custom_relu_op.cu   |   6 +-
 .../fluid/tests/custom_op/test_check_abi.py   |  31 ++-
 .../custom_op/test_custom_relu_op_jit.py      |  10 +-
 .../utils/cpp_extension/cpp_extension.py      |  18 +-
 .../utils/cpp_extension/extension_utils.py    |  92 +++++---
 python/setup.py.in                            |  16 +-
 tools/parallel_UT_rule.py                     | 218 +++++++++++++++++-
 13 files changed, 405 insertions(+), 175 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 24bed27728..0f85464f60 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -369,36 +369,3 @@ cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
 if(WITH_TESTING AND TEST selected_rows_test)
   set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
 endif()
-
-##### 2.0 New custom op extension mechanism related #####
-
-# if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_
-if (WIN32)
-    set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer)
-
-    set(PADDLE_CUSTOM_OP_SRCS
-        ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc
-        ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc)
-    set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE)
-
-    cc_library(paddle_custom_op_shared
-        SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES})
-
-    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op)
-    target_link_libraries(paddle_custom_op_shared ${os_dependency_modules})
-
-    if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
-        set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-        set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE})
-    endif()
-    set(PADDLE_CUSTOM_OP_IMPORT_LIB
-        ${paddle_custom_op_lib_path}/paddle_custom_op.lib
-        CACHE INTERNAL "Paddle custom op import lib")
-    set(PADDLE_CUSTOM_OP_SHARED_LIB
-        ${paddle_custom_op_lib_path}/paddle_custom_op.dll
-        CACHE INTERNAL "Paddle custom op dll")
-endif()
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 439c8a4f24..e53828ff10 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -54,14 +54,14 @@ wmic process where name="python.exe" call terminate 2>NUL
 rem ------initialize common variable------
 if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64"
 if not defined BRANCH set BRANCH=develop
-if not defined WITH_TENSORRT set WITH_TENSORRT=ON
+if not defined WITH_TENSORRT set WITH_TENSORRT=ON
 if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT
 if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto
 if not defined WITH_GPU set WITH_GPU=ON
 if not defined WITH_MKL set WITH_MKL=ON
 if not defined WITH_AVX set WITH_AVX=ON
 if not defined WITH_TESTING set WITH_TESTING=ON
-if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=OFF
+if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=ON
 if not defined WITH_PYTHON set WITH_PYTHON=ON
 if not defined ON_INFER set ON_INFER=ON
 if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON
@@ -75,6 +75,7 @@ if not defined LOG_LEVEL set LOG_LEVEL=normal
 if not defined PRECISION_TEST set PRECISION_TEST=OFF
 if not defined NIGHTLY_MODE set PRECISION_TEST=OFF
 if not defined retry_times set retry_times=2
+if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37

 rem -------set cache build directory-----------
 rmdir build\python /s/q
 rmdir build\paddle_inference_install_dir /s/q
 rmdir build\paddle_inference_c_install_dir /s/q
 del build\CMakeCache.txt

-: set CI_SKIP_CPP_TEST if only *.py changed
-git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON
-
 if "%WITH_CACHE%"=="OFF" (
     rmdir build /s/q
     goto :mkbuild
@@ -135,58 +133,6 @@ dir .
 dir %cache_dir%
 dir paddle\fluid\pybind\Release

-rem ------initialize the python environment------
-if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
-set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
-set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH%
-
-rem ToDo: virtual environment can't be deleted safely, some process not exit when task is canceled
-rem Now use system python environment temporarily
-rem %PYTHON_EXECUTABLE% -m pip install virtualenv
-rem %PYTHON_EXECUTABLE% -m virtualenv paddle_winci
-rem call paddle_winci\Scripts\activate.bat
-
-rem ------pre install python requirement----------
-where python
-where pip
-pip install wheel --user
-pip install -r %work_dir%\python\requirements.txt --user
-
-if %ERRORLEVEL% NEQ 0 (
-    echo pip install requirements.txt failed!
-    exit /b 7
-)
-
-rem ------pre install clcache and init config----------
-rem pip install clcache --user
-pip uninstall -y clcache
-:: set USE_CLCACHE to enable clcache
-rem set USE_CLCACHE=1
-:: In some scenarios, CLCACHE_HARDLINK can save one file copy.
-rem set CLCACHE_HARDLINK=1
-:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
-rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
-:: set maximum cache size to 20G
-rem clcache.exe -M 21474836480
-
-:: install ninja if GENERATOR is Ninja
-if %GENERATOR% == "Ninja" (
-    pip install ninja
-    if %errorlevel% NEQ 0 (
-        echo pip install ninja failed!
-        exit /b 7
-    )
-)
-
-rem ------show summary of current environment----------
-cmake --version
-if "%WITH_GPU%"=="ON" (
-    nvcc --version
-    nvidia-smi
-)
-::python %work_dir%\tools\summary_env.py
-::%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh
-
 goto :CASE_%1

 echo "Usage: paddle_build.bat [OPTION]"
@@ -266,8 +212,10 @@ rem "Other configurations are added here"
 rem :CASE_wincheck_others
 rem call ...

+
 rem ---------------------------------------------------------------------------------------------
 :cmake
+@ECHO OFF
 echo ========================================
 echo Step 1. Cmake ...
 echo ========================================

@@ -281,12 +229,52 @@ set PATH=C:\Program Files (x86)\Windows Kits\10\bin\10.0.17763.0\x64;%PATH%

 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
 set start=%start:~4,10%
-@ECHO ON
-if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0
+if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2
 set PATH=%TENSORRT_ROOT:/=\%\lib;%CUDA_TOOLKIT_ROOT_DIR%\bin;%CUDA_TOOLKIT_ROOT_DIR%\libnvvp;%PATH%

-rem ------set third_party cache dir------
+rem install ninja if GENERATOR is Ninja
+if %GENERATOR% == "Ninja" (
+    pip install ninja
+    if %errorlevel% NEQ 0 (
+        echo pip install ninja failed!
+        exit /b 7
+    )
+)
+
+rem ------show summary of current GPU environment----------
+cmake --version
+if "%WITH_GPU%"=="ON" (
+    nvcc --version
+    nvidia-smi
+)
+
+rem ------initialize the python environment------
+set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
+set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH%
+if %WITH_PYTHON% == "OFF" (
+    where python
+    where pip
+    pip install wheel --user
+    pip install -r %work_dir%\python\requirements.txt --user
+    if %ERRORLEVEL% NEQ 0 (
+        echo pip install requirements.txt failed!
+        exit /b 7
+    )
+)
+
+rem ------pre install clcache and init config----------
+rem pip install clcache --user
+pip uninstall -y clcache
+:: set USE_CLCACHE to enable clcache
+rem set USE_CLCACHE=1
+:: In some scenarios, CLCACHE_HARDLINK can save one file copy.
+rem set CLCACHE_HARDLINK=1
+:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
+rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
+:: set maximum cache size to 20G
+rem clcache.exe -M 21474836480
+
+rem ------set third_party cache dir------
 : clear third party cache every once in a while
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
 set day_now=%datetime:~6,2%
@@ -500,6 +488,10 @@ echo ========================================
 echo Step 4. Running unit tests ...
 echo ========================================

+
+: set CI_SKIP_CPP_TEST if only *.py changed
+git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON
+
 pip install -r %work_dir%\python\unittest_py\requirements.txt --user
 if %ERRORLEVEL% NEQ 0 (
     echo pip install unittest requirements.txt failed!
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 9b03cd08ba..b493ecedd9 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -43,9 +43,20 @@ set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/)
 IF(WIN32)
   # Python would use the .pyd by default under Windows series platform
   set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.pyd)
-  set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.pyd)
+  set(FLUID_CORE_LIB ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.lib)
+
+  add_custom_command(OUTPUT ${FLUID_CORE}
+    COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
+    COMMAND cmake -E copy $<TARGET_LINKER_FILE:paddle_pybind> ${FLUID_CORE_LIB}
+    DEPENDS paddle_pybind)
+
+  set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.pyd)
 ELSE()
   set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.so)
+  add_custom_command(OUTPUT ${FLUID_CORE}
+    COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
+    DEPENDS paddle_pybind)
+
   set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.so)
 ENDIF()
@@ -68,9 +79,6 @@ if(HAS_NOAVX_CORE AND EXISTS "${NOAVX_CORE_FILE}")
   list(APPEND FLUID_CORE_DEPS ${FLUID_NOAVX_CORE})
 endif()

-add_custom_command(OUTPUT ${FLUID_CORE}
-  COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
-  DEPENDS paddle_pybind)
 add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE_DEPS})

 IF(WIN32)
@@ -84,6 +92,7 @@ ELSE(WIN32)
     COMMAND touch stub.cc
     COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+    COMMENT "Packing whl packages------>>>"
     DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ENDIF()
diff --git a/python/paddle/check_import_scipy.py b/python/paddle/check_import_scipy.py
index 0172d568e5..d6e13e2a67 100644
--- a/python/paddle/check_import_scipy.py
+++ b/python/paddle/check_import_scipy.py
@@ -24,6 +24,6 @@ def check_import_scipy(OsName):
         if 'DLL load failed' in print_info:
             raise ImportError(
                 print_info +
-                "\nplease download visual C++ Redistributable for vs 2015, https://www.microsoft.com/en-us/download/details.aspx?id=48145"
+                "\nplease download Visual C++ Redistributable from https://support.microsoft.com/en-us/topic/the-latest-supported-visual-c-downloads-2647da03-1eea-4433-9aff-95f26a218cc0"
             )
     return
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index 49bcaf6dd6..9e931ad40c 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -37,7 +37,10 @@ if os.path.exists(current_path + os.sep + 'core_noavx.' + core_suffix):
 try:
     if os.name == 'nt':
         third_lib_path = current_path + os.sep + '..' + os.sep + 'libs'
-        os.environ['path'] = third_lib_path + ';' + os.environ['path']
+        # Will load shared library from 'path' on windows
+        os.environ[
+            'path'] = current_path + ';' + third_lib_path + ';' + os.environ[
+                'path']
         sys.path.insert(0, third_lib_path)
         # Note: from python3.8, PATH will not take effect
         # https://github.com/python/cpython/pull/12302
@@ -298,7 +301,7 @@ if avx_supported():
             "WARNING: AVX is supported on local machine, but you have installed "
             "paddlepaddle without avx core. Hence, no_avx core which has worse "
             "performance will be imported.\nYou could reinstall paddlepaddle by "
-            "'python -m pip install -U paddlepaddle-gpu[==version]' or rebuild "
+            "'python -m pip install --force-reinstall paddlepaddle-gpu[==version]' or rebuild "
             "paddlepaddle WITH_AVX=ON to get better performance.\n"
             "The original error is: %s\n" % cpt.get_exception_message(e))
         load_noavx = True
@@ -350,12 +353,19 @@ if load_noavx:
         sys.stderr.write(
             'Error: Can not import noavx core while this file exists: ' +
             current_path + os.sep + 'core_noavx.' + core_suffix + '\n')
+    elif avx_supported():
+        sys.stderr.write(
+            "Error: AVX is supported on your machine, but you have installed "
+            "paddlepaddle without avx core, you should reinstall paddlepaddle by "
+            "'python -m pip install --force-reinstall paddlepaddle-gpu[==version]\n"
+        )
     else:
         sys.stderr.write(
-            "Error: AVX is not support on your machine, but you have installed "
-            "paddlepaddle with avx core, you should reinstall paddlepaddle by "
-            "'python -m pip install -U paddlepaddle-gpu[==version] -f "
-            "https://paddlepaddle.org.cn/whl/stable_noavx.html'\n")
+            "Error: AVX is not supported on your machine, but you have installed "
+            "paddlepaddle without the no_avx core, you should reinstall paddlepaddle by "
+            "'python -m pip install --force-reinstall paddlepaddle-gpu[==version] -f "
+            "https://paddlepaddle.org.cn/whl/mkl/stable/noavx.html or "
+            "https://paddlepaddle.org.cn/whl/openblas/stable/noavx.html\n")
     raise e
diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
index 81f64038c7..2092151b84 100644
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
@@ -1,6 +1,5 @@
-# New custom OP can support Windows/Linux now
-if(WITH_GPU OR APPLE)
-  # GPU custom op tests: compile both .cc and .cu file
+# New custom OP can support Windows/Linux/Mac now
+if(WITH_GPU OR APPLE)
   py_test(test_custom_relu_op_setup SRCS test_custom_relu_op_setup.py)
   py_test(test_custom_relu_op_jit SRCS test_custom_relu_op_jit.py)
   py_test(test_custom_relu_model SRCS test_custom_relu_model.py)
diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
index 4ec7d08845..38e8e71cf8 100644
--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
@@ -45,8 +45,12 @@ std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
   int grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       x.type(), "relu_cuda_forward_kernel", ([&] {
+        auto cpu_input = x.copy_to<data_t>(paddle::PlaceType::kCPU);
+        auto gpu_input = cpu_input.copy_to<data_t>(paddle::PlaceType::kGPU);
         relu_cuda_forward_kernel<data_t><<<grid, block>>>(
-            x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
+            gpu_input.data<data_t>(),
+            out.mutable_data<data_t>(x.place()),
+            numel);
       }));

   return {out};
diff --git a/python/paddle/fluid/tests/custom_op/test_check_abi.py b/python/paddle/fluid/tests/custom_op/test_check_abi.py
index 75cf99458e..baef25d2d1 100644
--- a/python/paddle/fluid/tests/custom_op/test_check_abi.py
+++ b/python/paddle/fluid/tests/custom_op/test_check_abi.py
@@ -64,14 +64,29 @@ class TestCheckCompiler(TestABIBase):
         # clear environ
         self.del_environ()
         compiler = 'python'  # fake wrong compiler
-        with warnings.catch_warnings(record=True) as error:
-            flag = utils.check_abi_compatibility(compiler, verbose=True)
-            # check return False
-            self.assertFalse(flag)
-            # check Compiler Compatibility WARNING
-            self.assertTrue(len(error) == 1)
-            self.assertTrue(
-                "Compiler Compatibility WARNING" in str(error[0].message))
+        if not utils.IS_WINDOWS:
+            with warnings.catch_warnings(record=True) as error:
+                flag = utils.check_abi_compatibility(compiler, verbose=True)
+                # check return False
+                self.assertFalse(flag)
+                # check Compiler Compatibility WARNING
+                self.assertTrue(len(error) == 1)
+                self.assertTrue(
+                    "Compiler Compatibility WARNING" in str(error[0].message))
+
+    def test_exception_windows(self):
+        # clear environ
+        self.del_environ()
+        compiler = 'fake compiler'  # fake command
+        if utils.IS_WINDOWS:
+            with warnings.catch_warnings(record=True) as error:
+                flag = utils.check_abi_compatibility(compiler, verbose=True)
+                # check return False
+                self.assertFalse(flag)
+                # check ABI Compatibility WARNING
+                self.assertTrue(len(error) == 1)
+                self.assertTrue("Failed to check compiler version for" in
+                                str(error[0].message))

     def test_exception_linux(self):
         # clear environ
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
index d8dcc76ac6..0f7ba84ffc 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
@@ -105,12 +105,12 @@ class TestJITLoad(unittest.TestCase):
                     in str(e))
                 if IS_WINDOWS:
                     self.assertTrue(
-                        r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:47"
-                        in str(e))
+                        r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc" in
+                        str(e))
                 else:
                     self.assertTrue(
-                        "python/paddle/fluid/tests/custom_op/custom_relu_op.cc:47"
-                        in str(e))
+                        "python/paddle/fluid/tests/custom_op/custom_relu_op.cc" in
+                        str(e))
             self.assertTrue(caught_exception)

         caught_exception = False
@@ -126,7 +126,7 @@ class TestJITLoad(unittest.TestCase):
                 "function \"relu_cuda_forward_kernel\" is not implemented for data type `int32_t`"
                 in str(e))
             self.assertTrue(
-                "python/paddle/fluid/tests/custom_op/custom_relu_op.cu:50" in
+                "python/paddle/fluid/tests/custom_op/custom_relu_op.cu" in
                 str(e))
         self.assertTrue(caught_exception)
diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py
index ab528cdb0c..6045ac7d1e 100644
--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -26,7 +26,7 @@ from .extension_utils import find_cuda_home, find_rocm_home, normalize_extension
 from .extension_utils import is_cuda_file, prepare_unix_cudaflags, prepare_win_cudaflags
 from .extension_utils import _import_module_from_library, _write_setup_file, _jit_compile
 from .extension_utils import check_abi_compatibility, log_v, CustomOpInfo, parse_op_name_from
-from .extension_utils import clean_object_if_change_cflags, _reset_so_rpath
+from .extension_utils import clean_object_if_change_cflags, _reset_so_rpath, _get_fluid_path
 from .extension_utils import bootstrap_context, get_build_directory, add_std_without_repeat
 from .extension_utils import IS_WINDOWS, OS_NAME, MSVC_COMPILE_FLAGS

@@ -69,7 +69,7 @@ def setup(**attr):
    For Linux, GCC version will be checked. For example, if Paddle with CUDA 10.1 is built with GCC 8.2,
    then the version of user's local machine should satisfy GCC >= 8.2.
    For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of
-    PaddlePaddle (Visual Studio 2015 update3).
+    PaddlePaddle (Visual Studio 2017).
    If the above conditions are not met, the corresponding warning will be printed, and a fatal error may
    occur because of ABI compatibility.
@@ -79,7 +79,7 @@ def setup(**attr):
    2. On Linux platform, we recommend to use GCC 8.2 as soft linking candidate of ``/usr/bin/cc`` .
    Then, use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking
    GCC version.
-    3. On Windows platform, we recommend to install ``Visual Studio`` (>=2015 update3).
+    3. On Windows platform, we recommend to install ``Visual Studio`` (>=2017).


    Compared with Just-In-Time ``load`` interface, it only compiles once by executing
@@ -611,7 +611,7 @@ class BuildExtension(build_ext, object):
                 msg = (
                     'It seems that the VC environment is activated but DISTUTILS_USE_SDK is not set.'
                     'This may lead to multiple activations of the VC env.'
-                    'Please set `DISTUTILS_USE_SDK=1` and try again.')
+                    'Please run `set DISTUTILS_USE_SDK=1` and try again.')
                 raise UserWarning(msg)

     def _record_op_info(self):
@@ -724,7 +724,7 @@ def load(name,
    processes under an individual subprocess. It does not require CMake or Ninja
    environment. On Linux platform, it requires GCC compiler whose version is
    greater than 5.4 and it should be soft linked to ``/usr/bin/cc`` . On Windows
-    platform, it requires Visual Studio whose version is greater than 2015 update3.
+    platform, it requires Visual Studio whose version is greater than 2017.
    On MacOS, clang++ is required. In addition, if compiling Operators supporting
    GPU device, please make sure ``nvcc`` compiler is installed in local environment.

@@ -735,7 +735,7 @@ def load(name,
    For Linux, GCC version will be checked. For example, if Paddle with CUDA 10.1 is built with GCC 8.2,
    then the version of user's local machine should satisfy GCC >= 8.2. For Windows, Visual Studio
    version will be checked, and it should be greater than or equal to that of
-    PaddlePaddle (Visual Studio 2015 update3).
+    PaddlePaddle (Visual Studio 2017).
    If the above conditions are not met, the corresponding warning will be printed, and a fatal
    error may occur because of ABI compatibility.

@@ -749,7 +749,7 @@ def load(name,
    2. On Linux platform, we recommend to use GCC 8.2 as soft linking candidate of ``/usr/bin/cc`` .
    Then, use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking
    GCC version.
-    3. On Windows platform, we recommend to install ``Visual Studio`` (>=2015 update3).
+    3. On Windows platform, we recommend to install ``Visual Studio`` (>=2017).
    **A simple example:**

@@ -802,9 +802,6 @@ def load(name,
     # ensure to use abs path
     build_directory = os.path.abspath(build_directory)

-    # Will load shared library from 'path' on windows
-    if IS_WINDOWS:
-        os.environ['path'] = build_directory + ';' + os.environ['path']

     log_v("build_directory: {}".format(build_directory), verbose)

@@ -827,6 +824,7 @@ def load(name,
     # write setup.py file and compile it
     build_base_dir = os.path.join(build_directory, name)
+
     _write_setup_file(name, sources, file_path, build_base_dir,
                       extra_include_paths, extra_cxx_cflags, extra_cuda_cflags,
                       extra_ldflags, verbose)
diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py
index c055084886..ea46ea8b39 100644
--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -55,7 +55,7 @@ CLANG_LINK_FLAGS = [
     '-dynamiclib', '-undefined', 'dynamic_lookup', '-arch', 'x86_64'
 ]

-MSVC_LINK_FLAGS = ['/MACHINE:X64', 'paddle_custom_op.lib']
+MSVC_LINK_FLAGS = ['/MACHINE:X64']

 COMMON_NVCC_FLAGS = ['-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU']

@@ -371,10 +371,11 @@ def _get_core_name():
     Return pybind DSO module name.
     """
     import paddle
-    if paddle.fluid.core.load_noavx:
-        return 'core_noavx.so'
+    ext_name = '.pyd' if IS_WINDOWS else '.so'
+    if not paddle.fluid.core.load_noavx:
+        return 'core_avx' + ext_name
     else:
-        return 'core_avx.so'
+        return 'core_noavx' + ext_name


 def _get_lib_core_path():
@@ -386,6 +387,15 @@ def _get_lib_core_path():
     return os.path.join(_get_fluid_path(), lib_core_name)


+def _get_dll_core_path():
+    """
+    Return real path of paddle_pybind.dll on Windows.
+    """
+    raw_core_name = _get_core_name()
+    dll_core_name = "paddle_pybind.dll"
+    return os.path.join(_get_fluid_path(), dll_core_name)
+
+
 def _reset_so_rpath(so_path):
     """
     NOTE(Aurelius84): Runtime path of core_(no)avx.so is modified into `@loader_path/../libs`
@@ -435,9 +445,12 @@ def normalize_extension_kwargs(kwargs, use_cuda=False):
         # append link flags
         extra_link_args = kwargs.get('extra_link_args', [])
         extra_link_args.extend(MSVC_LINK_FLAGS)
+        lib_core_name = create_sym_link_if_not_exist()
+        extra_link_args.append('{}'.format(lib_core_name))
         if use_cuda:
             extra_link_args.extend(['cudadevrt.lib', 'cudart_static.lib'])
         kwargs['extra_link_args'] = extra_link_args
+
     else:
         ########################### Linux Platform ###########################
         extra_link_args = kwargs.get('extra_link_args', [])
@@ -481,24 +494,41 @@ def create_sym_link_if_not_exist():
     """
     Create soft symbol link of `core_avx.so` or `core_noavx.so`
     """
-    assert OS_NAME.startswith('darwin')
+    assert OS_NAME.startswith('darwin') or IS_WINDOWS

     raw_core_name = _get_core_name()
     core_path = os.path.join(_get_fluid_path(), raw_core_name)
-    new_lib_core_path = _get_lib_core_path()
+    if IS_WINDOWS:
+        new_dll_core_path = _get_dll_core_path()
+        # create symbol link on windows
+        if not os.path.exists(new_dll_core_path):
+            try:
+                os.symlink(core_path, new_dll_core_path)
+            except Exception:
+                warnings.warn(
+                    "Failed to create soft symbol link for {}.\n You can run prompt as administrator and execute the "
+                    "following command manually: `mklink {} {}`. Now a hard link for {} will be created instead.".
+                    format(raw_core_name, new_dll_core_path, core_path,
+                           raw_core_name))
+                run_cmd('mklink /H {} {}'.format(new_dll_core_path, core_path))
+        # core_avx or core_noavx with lib suffix
+        assert os.path.exists(new_dll_core_path)
+        return raw_core_name[:-4] + ".lib"

-    # create symbol link
-    if not os.path.exists(new_lib_core_path):
-        try:
-            os.symlink(core_path, new_lib_core_path)
-            assert os.path.exists(new_lib_core_path)
-        except Exception:
-            raise RuntimeError(
-                "Failed to create soft symbol link for {}.\n Please execute the following command manually: `ln -s {} {}`".
-                format(raw_core_name, core_path, new_lib_core_path))
+    else:
+        new_lib_core_path = _get_lib_core_path()
+        # create symbol link on mac
+        if not os.path.exists(new_lib_core_path):
+            try:
+                os.symlink(core_path, new_lib_core_path)
+                assert os.path.exists(new_lib_core_path)
+            except Exception:
+                raise RuntimeError(
+                    "Failed to create soft symbol link for {}.\n Please execute the following command manually: `ln -s {} {}`".
+                    format(raw_core_name, core_path, new_lib_core_path))

-    # core_avx or core_noavx without suffix
-    return raw_core_name[:-3]
+        # core_avx or core_noavx without suffix
+        return raw_core_name[:-3]


 def find_cuda_home():
@@ -1054,20 +1084,20 @@ def check_abi_compatibility(compiler, verbose=False):
     if os.environ.get('PADDLE_SKIP_CHECK_ABI') in ['True', 'true', '1']:
         return True

-    which = 'where' if IS_WINDOWS else 'which'
-    cmd_out = subprocess.check_output(
-        [which, compiler], stderr=subprocess.STDOUT)
-    compiler_path = os.path.realpath(cmd_out.decode()
-                                     if six.PY3 else cmd_out).strip()
-    # step 1. if not found any suitable compiler, raise error
-    if not any(name in compiler_path
-               for name in _expected_compiler_current_platform()):
-        warnings.warn(
-            WRONG_COMPILER_WARNING.format(
-                user_compiler=compiler,
-                paddle_compiler=_expected_compiler_current_platform()[0],
-                platform=OS_NAME))
-        return False
+    if not IS_WINDOWS:
+        cmd_out = subprocess.check_output(
+            ['which', compiler], stderr=subprocess.STDOUT)
+        compiler_path = os.path.realpath(cmd_out.decode()
+                                         if six.PY3 else cmd_out).strip()
+        # if no suitable compiler is found, raise a warning
+        if not any(name in compiler_path
+                   for name in _expected_compiler_current_platform()):
+            warnings.warn(
+                WRONG_COMPILER_WARNING.format(
+                    user_compiler=compiler,
+                    paddle_compiler=_expected_compiler_current_platform()[0],
+                    platform=OS_NAME))
+            return False

     version = (0, 0, 0)
     # clang++ has no ABI compatibility problem
diff --git a/python/setup.py.in b/python/setup.py.in
index d9ca3038fb..0f2e97192c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -255,11 +255,15 @@ paddle_bins = ''
 if not '${WIN32}':
     paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
-package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]}
+
+if os.name != 'nt':
+    package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + '.so']}
+else:
+    package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + '.pyd', '${FLUID_CORE_NAME}' + '.lib']}
+
 if '${HAS_NOAVX_CORE}' == 'ON':
     package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')]
-
 package_dir={
     '': '${PADDLE_BINARY_DIR}/python',
     # The paddle.fluid.proto will be generated while compiling.
@@ -353,14 +357,6 @@ if '${WITH_XPU}' == 'OFF' and '${XPU_SDK_ROOT}' != '':
     package_data['paddle.libs']+=['libxpurt.so']

-### New custom op extension mechanism related ###
-
-# copy paddle_custom_op.lib/paddle_custom_op.dll to libs on Windows
-if os.name == 'nt':
-    shutil.copy('${PADDLE_CUSTOM_OP_IMPORT_LIB}', libs_path)
-    shutil.copy('${PADDLE_CUSTOM_OP_SHARED_LIB}', libs_path)
-    package_data['paddle.libs'] += ['paddle_custom_op.lib', 'paddle_custom_op.dll']
-
 # remove unused paddle/libs/__init__.py
 if os.path.isfile(libs_path+'/__init__.py'):
     os.remove(libs_path+'/__init__.py')
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index d2969618b8..9d03ae22de 100644
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -436,9 +436,172 @@ CPU_PARALLEL_JOB = [
     'assign_op_test',
     'allocator_facade_frac_flags_test',
     'aes_cipher_test',
+    'test_dist_sparse_tensor_load_adagrad',
+    'test_dist_mnist_fp16_allreduce',
+    'test_dist_mnist_gradient_merge',
+    'test_dist_allreduce_op',
+    'test_hdfs3',
+    'test_parallel_dygraph_se_resnext',
+    'test_dist_fleet_ps9',
+    'test_dist_fleet_infer',
+    'test_dist_se_resnext_sync',
+    'test_dist_oneps',
+    'test_dist_sparse_load_ps1',
+    'test_dist_mnist_batch_merge',
+    'test_dist_fleet_ctr',
+    'test_dist_fleet_ps10',
+    'test_parallel_dygraph_transformer',
+    'test_dist_mnist_fleetapi',
+    'test_dist_sparse_tensor_load_adam',
+    'test_dist_fleet_ps4',
+    'test_dist_fleet_heter_program',
+    'test_parallel_dygraph_sparse_embedding_over_height',
+    'test_hdfs2',
+    'test_dist_sharding_save',
+    'test_dist_fleet_ps_gpu_ctr',
+    'test_dist_mnist_backward_deps',
+    'test_dist_fleet_heter_base',
+    'test_dist_sparse_tensor_load_sgd',
+    'test_new_group',
+    'test_dist_mnist_with_program',
+    'test_dist_mnist_pg',
+    'test_dist_sparse_tensor_load_rmsprop',
+    'test_auto_checkpoint2',
+    'test_dist_sparse_tensor_load_ftrl',
+    'test_dist_fleet_ps6',
+    'test_dist_mnist_fleet_save',
+    'test_auto_checkpoint1',
+    'test_dist_fleet_a_sync_optimizer_sync',
+    'test_dist_fleet_ps3',
+    'test_dist_se_resnext_nccl',
+    'test_parallel_dygraph_mnist',
+    'test_auto_checkpoint_multiple',
+    'test_dist_fleet_a_sync_optimizer_auto_async',
+    'test_pipeline',
+    'test_dist_fleet_ps8',
+    'test_dist_fleet_sparse_embedding_ctr',
+    'test_dist_se_resnext_dgc',
+    'test_dist_fleet_ps7',
+    'test_dist_fleet_decay',
+    'test_dist_fleet_a_sync_optimizer_auto_geo',
+    'test_dist_fleet_geo',
+    'test_parallel_dygraph_dataparallel',
+    'test_hdfs1',
+    'test_dist_mnist_dgc_nccl',
+    'test_dist_fleet_ctr2',
+    'test_parallel_dygraph_unused_variables',
+    'test_dist_mnist_multi_comm',
+    'test_dist_sparse_tensor_load_momentum',
+    'test_gen_nccl_id_op',
+    'test_parallel_dygraph_sparse_embedding',
+    'test_dist_mnist_ring_allreduce',
+    'test_fleet_launch_async',
+    'test_dist_fleet_a_sync_optimizer_geo',
+    'test_parallel_dygraph_control_flow',
+    'test_auto_checkpoint',
+    'test_fleet_pipeline_meta_optimizer',
+    'test_dist_fleet_heter_ctr',
+    'test_fleet_graph_execution_meta_optimizer',
+    'test_fleet_run_random_port',
+    'test_dist_fleet_ps5',
+    'test_dist_fleet_a_sync_optimizer_auto',
+    'test_dist_lookup_sparse_table_fuse_ops',
+    'test_dist_fleet_a_sync_optimizer_async',
+    'test_c_comm_init_op',
+    'test_fleet_launch_nproc',
+    'test_dist_fleet_simnet',
+    'test_auto_checkpoint_dist_basic',
+    'test_fleet_launch_cloud',
+    'test_dist_fleet_ps',
+    'test_dist_op',
+    'test_dist_sparse_load_ps0',
+    'test_auto_checkpoint3',
+    'test_dist_fleet_ps2',
+    'test_dist_fleet_grad_clip',
+    'test_custom_concat',
+    'test_analyzer_transformer_fuse',
+    'test_analyzer_seq_pool1_fuse_statis',
+    'test_fc_lstm_fuse_pass_cc',
+    'test_layer_norm_fuse_pass',
+    'test_fc_gru_fuse_pass_cc',
+    'test_analyzer_save_model',
+    'test_fleet_ps',
+    'test_analyzer_multi_model_prediction',
+    'test_fleet_base_3',
+    'test_fleet_base_2',
+    'test_ascend_trigger',
+    'test_fleet_amp_meta_optimizer',
+    'test_fleetrun',
+    'test_check_abi',
+    'dense_table_test',
+    'test_adaptive_pool2d_convert_global_pass',
+    'test_fleet_recompute_meta_optimizer',
+    'test_fleet_fp16_allreduce_meta_optimizer',
+    'test_post_training_quantization_lstm_model',
+    'test_fleet_metric',
+    'test_fleet_gradient_merge_meta_optimizer',
+    'test_fleet_sharding_meta_optimizer',
+    'test_listen_and_serv_op',
+    'test_analyzer_zerocopytensor_tensor',
+    'test_conv_bn_fuse_pass_cc',
+    'test_collective_optimizer',
+    'test_bf16_utils',
+    'test_analyzer_seq_pool1_compare_determine',
+    'test_avoid_twice_initialization',
+    'test_callback_early_stop',
+    'test_fleet_distributed_strategy',
+    'test_launch_coverage',
+    'test_sgd_op_bf16',
+    'test_model_cast_to_bf16',
+    'test_hybrid_parallel_topology',
+    'barrier_table_test',
+    'test_check_error',
+    'test_fleet_lamb_meta_optimizer',
+    'test_fleet_rolemaker_2',
+    'test_distributed_strategy',
+    'test_rnn_cudnn_params_packing',
+    'test_communicator_async',
+    'brpc_utils_test',
+    'test_analyzer_capi_pd_tensor',
+    'test_recv_save_op',
+    'heter_listen_and_server_test',
+    'test_analyzer_capi_ner',
+    'test_unsqueeze2_eltwise_fuse_pass',
+    'test_dgc_optimizer',
+    'test_fleet_cc',
+    'test_repeated_fc_relu_fuse_pass_cc',
+    'heter_server_test',
+    'test_static_save_load_large',
+    'graph_node_test',
+    'test_custom_conj',
+    'test_fleet_private_function',
+    'test_fake_init_op',
+    'brpc_service_sparse_sgd_test',
+    'test_tf32_cudnn',
+    'test_communicator_geo',
+    'test_dispatch_jit',
+    'test_layer_norm_fuse_pass_cc',
+    'test_fleet_dgc_meta_optimizer',
+    'test_fc_fuse_pass_cc',
+    'test_communicator_sync',
+    'test_analyzer_capi',
+    'test_fleet_lars_meta_optimizer',
+    'test_communicator_half_async',
+    'test_fleet_localsgd_meta_optimizer',
+    'test_fleet_amp_init',
+    'test_fleet_checkpoint',
+    'test_analyzer_seq_pool1_fuse_compare_zero_copy',
+    'test_lookup_table_bf16_op',
+    'test_fleet_meta_optimizer_base',
+    'table_test',
+    'test_fleet_rolemaker_new',
+    'test_fleet_graph_executor',
+    'test_multi_out_jit',
+    'test_fleet_utils',
+    'brpc_service_dense_sgd_test',
 ]

-# It run 4 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
+# It runs 4 jobs at a time. If a test fails due to insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
 # just remove it from this list.
 TETRAD_PARALLEL_JOB = [
     'buffered_allocator_test',
@@ -477,9 +640,53 @@ TETRAD_PARALLEL_JOB = [
     'tensor_test',
     'test_repeated_fc_relu_fuse_pass_cc',
     'test_mkldnn_caching',
+    'test_analyzer_seq_pool1',
+    'test_analyzer_ocr',
+    'test_analyzer_seq_conv1',
+    'test_analyzer_small_dam',
+    'test_analyzer_mobilenet_depthwise_conv',
+    'test_analyzer_pyramid_dnn',
+    'test_analyzer_text_classification',
+    'test_analyzer_rnn2',
+    'test_analyzer_transformer',
+    'test_analyzer_resnet50',
+    'test_analyzer_ner',
+    'test_analyzer_lac',
+    'test_analyzer_transformer_profile',
+    'test_analyzer_mobilenet_transpose',
+    'test_analyzer_rnn1',
+    'test_analyzer_seq_pool1_profile',
+    'test_analyzer_paddletensor_tensor',
+    'test_analyzer_bert',
+    'test_analyzer_googlenet',
+    'zero_copy_tensor_test',
+    'custom_tensor_test',
+    'test_fleet_base',
+    'test_imperative_container_layerdict',
+    'test_complex_simplenet',
+    'test_tensor_register_hook',
+    'test_set_value_op',
+    'test_tensor_type_promotion',
+    'test_view_op_reuse_allocation',
+    'test_complex_grad_accumulated',
+    'test_sequential',
+    'test_imperative_layers',
+    'test_dgc_momentum_op',
+    'test_memcpy_op',
+    'test_dgc_op',
+    'test_modelaverage',
+    'test_lookahead',
+    'test_callback_visualdl',
+    'test_new_group_api',
+    'test_collective_split_embedding_none_divisible',
+    'test_collective_wait',
+    'test_collective_split_row_linear',
+    'test_collective_split_col_linear',
+    'test_collective_split_embedding',
 ]

-# It run 2 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
+# It runs 2 jobs at a time. If a test fails due to insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
 # just remove it from this list.
 TWO_PARALLEL_JOB = [
     'convert_model2dot_ernie',
@@ -611,7 +818,6 @@ TWO_PARALLEL_JOB = [
     'test_adam_op_multi_thread',
     'test_adamax_op',
     'test_while_loop_op',
-    'test_affine_grid_function',
     'test_transpose_flatten_concat_fuse_pass',
     'test_trace_op',
     'test_backward',
@@ -663,7 +869,6 @@ TWO_PARALLEL_JOB = [
     'test_gather_op',
     'test_partial_concat_op',
     'test_gaussian_random_op',
-    'test_paddle_imperative_double_grad',
     'test_generate_proposals_v2_op',
     'test_pad_constant_like',
     'test_grid_sample_function',
@@ -879,6 +1084,11 @@ TWO_PARALLEL_JOB = [
     'test_imperative_load_static_param',
     'test_fuse_bn_add_act_pass',
     'test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass',
+    'test_quantize_transpiler_v2',
+    'paddle_infer_api_test',
+    'test_analyzer_ernie',
+    'lite_resnet50_test',
+    'lite_mul_model_test',
 ]
-- 
GitLab
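
Editor's note (not part of the patch): a minimal sketch of the user-facing flow this patch changes. After it, a custom op built on Windows links against the paddle_pybind core (through the core_avx/core_noavx .lib symlink created by create_sym_link_if_not_exist) rather than the removed paddle_custom_op.dll. The sketch assumes the custom_relu_op.cc/.cu sources from python/paddle/fluid/tests/custom_op are in the working directory; the module name 'custom_jit_ops' is illustrative only.

    import paddle
    from paddle.utils.cpp_extension import load

    # JIT-compile the extension; on Windows the required .lib link flag is
    # appended automatically by normalize_extension_kwargs().
    custom_ops = load(
        name='custom_jit_ops',  # hypothetical module name
        sources=['custom_relu_op.cc', 'custom_relu_op.cu'],
        verbose=True)

    x = paddle.randn([4, 10], dtype='float32')
    out = custom_ops.custom_relu(x)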