From 084e70cdaea91c6a95b0f28e5e06f6aeeec7a395 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Fri, 10 Jul 2020 11:28:59 +0800
Subject: [PATCH] Attempt to resolve the TLS problem (#25390) (#25469)

* attempt to resolve tls problem, test=develop
* add glibc version check, test=develop
* fix regex, test=develop
* refine get_libc_ver, test=develop
* refine get_libc_ver, test=develop
---
 cmake/external/mkldnn.cmake | 18 ----------
 python/paddle/fluid/core.py | 89 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 19 deletions(-)

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 98a14c646ed..5ce77a72f24 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -36,28 +36,12 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR
 
 INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
 
-IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
-    SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
-    MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
-ELSE()
-    MESSAGE(STATUS "Build MKLDNN without MKLML")
-ENDIF()
 
 IF(NOT WIN32)
     SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
     SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
     SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
     SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
-
-    IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
-        # Force libmkldnn.so to link libiomp5.so (provided by intel mkl) instead of libgomp.so (provided by gcc),
-        # since core_avx.so links libiomp5.so
-        set(MKLDNN_SHARED_LINKER_FLAG "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed -L${MKLML_LIB_DIR} -liomp5")
-        set(FORBID "-fopenmp")
-    ELSE()
-        set(MKLDNN_SHARED_LINKER_FLAG "${CMAKE_SHARED_LINKER_FLAGS}")
-        set(FORBID "")
-    ENDIF()
 ELSE()
     SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
 ENDIF(NOT WIN32)
@@ -91,8 +75,6 @@ ExternalProject_Add(
                         -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
                         -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
                         -DDNNL_BUILD_TESTS=OFF -DDNNL_BUILD_EXAMPLES=OFF
-                        -DCMAKE_SHARED_LINKER_FLAGS=${MKLDNN_SHARED_LINKER_FLAG}
-                        -DCMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS=${FORBID}
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
 )
 if(WIN32)
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index c3fbb7b51b5..d7d0c68a314 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -17,6 +17,8 @@ from __future__ import print_function
 import site
 import sys
 import os
+import warnings
+import platform
 
 core_suffix = 'so'
 if os.name == 'nt':
@@ -62,7 +64,6 @@ def avx_supported():
     """
     Whether current system(Linux, MacOS, Windows) is supported with AVX.
     """
-    import platform
     from .. import compat as cpt
     sysstr = platform.system().lower()
     has_avx = False
@@ -160,6 +161,92 @@ def avx_supported():
         return False
 
 
+def run_shell_command(cmd):
+    """Run `cmd` through the shell and return its decoded stdout.
+
+    Returns None if the command wrote anything to stderr (e.g. the tool is
+    missing), so callers must be prepared for a None result.
+    """
+    import subprocess
+    out, err = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+        shell=True).communicate()
+    if err:
+        return None
+    return out.decode('utf-8')
+
+
+def get_dso_path(core_so, dso_name):
+    """Return the path ldd reports for `dso_name` in `core_so`, or None."""
+    if not (core_so and dso_name):
+        return None
+    out = run_shell_command("ldd %s|grep %s|awk '{print $3}'" %
+                            (core_so, dso_name))
+    # run_shell_command yields None when the pipeline wrote to stderr.
+    return out.strip() if out else None
+
+
+def load_dso(dso_absolute_path):
+    """Best-effort load of `dso_absolute_path`; warn instead of raising."""
+    if dso_absolute_path:
+        try:
+            from ctypes import cdll
+            cdll.LoadLibrary(dso_absolute_path)
+        except (ImportError, OSError):
+            warnings.warn("Load {} failed".format(dso_absolute_path))
+
+
+def pre_load(dso_name):
+    """Look up `dso_name` in the core extension's ldd output and load it."""
+    if has_avx_core:
+        core_so = current_path + os.sep + 'core_avx.' + core_suffix
+    elif has_noavx_core:
+        core_so = current_path + os.sep + 'core_noavx.' + core_suffix
+    else:
+        core_so = None
+    dso_path = get_dso_path(core_so, dso_name)
+    load_dso(dso_path)
+
+
+def get_glibc_ver():
+    """Return the glibc version printed by `ldd --version`, or None.
+
+    None is returned when ldd writes to stderr, prints nothing, or prints
+    something that is not a dotted number, so this never raises at import.
+    """
+    import re
+    out = run_shell_command("ldd --version | awk '/ldd/{print $NF}'")
+    ver = out.strip() if out else ''
+    return ver if re.match(r'\d+(\.\d+)*$', ver) else None
+
+
+def less_than_ver(a, b):
+    """Return True if dotted version string `a` is lower than `b`."""
+    import re
+    import operator
+
+    def to_list(s):
+        # Drop trailing '.0' groups so that '2.0' compares equal to '2'.
+        s = re.sub(r'(\.0+)+$', '', s)
+        return [int(x) for x in s.split('.')]
+
+    return operator.lt(to_list(a), to_list(b))
+
+
+# NOTE(zhiqiu): An error may occur when importing paddle on a Linux platform with glibc < 2.22,
+# the error message of which is "dlopen: cannot load any more object with static TLS".
+# This happens when:
+# (1) the number of dynamic shared libraries (DSO) loaded > 14,
+# (2) after that, load a dynamic shared library (DSO) with static TLS.
+# For paddle, the problem is that 'libgomp' is a DSO with static TLS, and it is loaded after 14 DSOs.
+# So, here is a tricky way to solve the problem by pre-loading 'libgomp' before 'core_avx.so'.
+# The final solution is to upgrade glibc to > 2.22 on the target system.
+if platform.system().lower() == 'linux':
+    _glibc_ver = get_glibc_ver()
+    # Skip the workaround when the glibc version cannot be determined.
+    if _glibc_ver and less_than_ver(_glibc_ver, '2.23'):
+        pre_load('libgomp')
+
 load_noavx = False
 
 if avx_supported():
-- 
GitLab