From 5b8fe87faf2a73c27ac85905fa995a9db0da2b71 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 2 May 2017 13:51:22 +0800 Subject: [PATCH] dlopen lapacke api and remove gfotran --- cmake/external/openblas.cmake | 29 +-------- paddle/cuda/CMakeLists.txt | 6 +- paddle/cuda/src/hl_cuda_cublas.cc | 2 +- paddle/cuda/src/hl_cuda_cudnn.cc | 2 +- paddle/cuda/src/hl_cuda_device.cc | 28 ++++----- paddle/cuda/src/hl_warpctc_wrap.cc | 2 +- paddle/math/MathFunctions.cpp | 59 ++++++++++++++++--- .../hl_dso_loader.cc => utils/DynamicLoad.cc} | 14 ++++- .../hl_dso_loader.h => utils/DynamicLoad.h} | 15 +++-- 9 files changed, 93 insertions(+), 64 deletions(-) rename paddle/{cuda/src/hl_dso_loader.cc => utils/DynamicLoad.cc} (94%) rename paddle/{cuda/include/hl_dso_loader.h => utils/DynamicLoad.h} (84%) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 92ea23c7633..317a1a92043 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -27,33 +27,6 @@ IF(NOT ${CBLAS_FOUND}) SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE) ENDIF(WIN32) - IF(CMAKE_COMPILER_IS_GNUCC) - ENABLE_LANGUAGE(Fortran) - if (NOT CMAKE_Fortran_COMPILER_VERSION) - # cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly. - execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion - OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION) - endif() - string(REGEX MATCHALL "[0-9]+" Fortran_VERSION ${CMAKE_Fortran_COMPILER_VERSION}) - list(GET Fortran_VERSION 0 Fortran_MAJOR) - list(GET Fortran_VERSION 1 Fortran_MINOR) - find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS - /lib - /usr/lib - /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/ - /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/) - if (NOT GFORTRAN_LIBRARY) - message(FATAL_ERROR "Cannot found gfortran library which it is used by openblas") - endif() - find_package(Threads REQUIRED) - LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT}) - ENDIF(CMAKE_COMPILER_IS_GNUCC) - - IF(NOT CMAKE_Fortran_COMPILER) - MESSAGE(FATAL_ERROR "To build lapack in libopenblas, " - "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...") - ENDIF(NOT CMAKE_Fortran_COMPILER) - ADD_DEFINITIONS(-DPADDLE_USE_LAPACK) ExternalProject_Add( @@ -64,7 +37,7 @@ IF(NOT ${CBLAS_FOUND}) PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 - BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib + BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_LAPACK=1 DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX= UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index a28ccd6f07c..f9061e96deb 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -21,16 +21,13 @@ set(CUDA_CXX_WITH_GPU_SOURCES if(WITH_GPU) set(CUDA_CXX_SOURCES - src/hl_dso_loader.cc src/hl_warpctc_wrap.cc ${CUDA_CXX_WITH_GPU_SOURCES}) set_source_files_properties(${CUDA_CXX_SOURCES} PROPERTIES COMPILE_FLAGS "-D__NVCC__") else() - set(CUDA_CXX_SOURCES - src/hl_dso_loader.cc - src/hl_warpctc_wrap.cc) + set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc) endif() set(CUDA_CU_SOURCES @@ -47,7 +44,6 @@ set(CUDA_CU_SOURCES set(CUDA_HEADERS include/hl_time.h - include/hl_dso_loader.h include/hl_warpctc_wrap.h include/hl_sequence.h include/hl_cuda_cublas.h diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc index 182e8ab218c..54c6d60c16e 100644 --- a/paddle/cuda/src/hl_cuda_cublas.cc +++ b/paddle/cuda/src/hl_cuda_cublas.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include #include #include "hl_cuda.h" -#include "hl_dso_loader.h" #include "hl_thread.ph" +#include "paddle/utils/DynamicLoad.h" #include "paddle/utils/Logging.h" namespace dynload { diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index 6198f067bab..4de6a863543 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -17,8 +17,8 @@ limitations under the License. */ #include #include #include "hl_cuda_cudnn.ph" -#include "hl_dso_loader.h" #include "hl_thread.ph" +#include "paddle/utils/DynamicLoad.h" #include "paddle/utils/Logging.h" DEFINE_int32(cudnn_conv_workspace_limit_in_mb, diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index 6dfb12e00b8..e7a8d563908 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -24,8 +24,8 @@ limitations under the License. */ #include #include "hl_cuda.ph" #include "hl_thread.ph" -#include "hl_dso_loader.h" #include "paddle/utils/Logging.h" +#include "paddle/utils/DynamicLoad.h" // clang-format on namespace dynload { @@ -98,11 +98,11 @@ int g_cuda_lib_version = 0; * Check build-in cuda function using glog and it **does not** * support << operator for more details error info. */ -#define CHECK_CUDA(cudaFunc) \ - do { \ - cudaError_t cudaStat = cudaFunc; \ - CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \ - << cudaGetErrorString(cudaStat); \ +#define CHECK_CUDA(cudaFunc) \ + do { \ + cudaError_t cudaStat = cudaFunc; \ + CHECK_EQ(cudaSuccess, cudaStat) \ + << "Cuda Error: " << cudaGetErrorString(cudaStat); \ } while (0) /** @@ -469,8 +469,8 @@ void hl_specify_devices_start(int *device, int number) { CHECK(tmp) << "[Start failed] System memory is not enough."; g_device = (hl_device_prop *)tmp; - device_prop = (hl_device_prop)( - (char *)tmp + g_system_device_num * sizeof(hl_device_prop *)); + device_prop = (hl_device_prop)((char *)tmp + g_system_device_num * + sizeof(hl_device_prop *)); memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *)); int num = 0; for (int i = 0; i < number; i++) { @@ -559,8 +559,8 @@ bool hl_get_sync_flag() { return g_sync_flag; } void hl_stream_synchronize(hl_stream_t stream) { cudaStream_t cu_stream; - CHECK_LT(stream, HPPL_STREAM_END) << __func__ - << ": the parameter stream is error."; + CHECK_LT(stream, HPPL_STREAM_END) + << __func__ << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; CHECK_CUDA(cudaStreamSynchronize(cu_stream)); @@ -590,8 +590,8 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) { cudaStream_t cu_stream; CHECK_NOTNULL(event); - CHECK_LT(stream, HPPL_STREAM_END) << __func__ - << ": the parameter stream is error."; + CHECK_LT(stream, HPPL_STREAM_END) + << __func__ << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream)); @@ -601,8 +601,8 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { cudaStream_t cu_stream; CHECK_NOTNULL(event); - CHECK_LT(stream, HPPL_STREAM_END) << __func__ - << ": the parameter stream is error."; + CHECK_LT(stream, HPPL_STREAM_END) + << __func__ << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0)); diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc index f57efb2b467..5a4de24ced0 100644 --- a/paddle/cuda/src/hl_warpctc_wrap.cc +++ b/paddle/cuda/src/hl_warpctc_wrap.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "hl_warpctc_wrap.h" #include -#include "hl_dso_loader.h" +#include "paddle/utils/DynamicLoad.h" #include "paddle/utils/Logging.h" namespace dynload { diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index 6203cd3b9ab..895ae104bef 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -15,6 +15,49 @@ limitations under the License. */ #include "MathFunctions.h" #include "hl_matrix_apply.cuh" #include "hl_matrix_ops.cuh" +#include "paddle/utils/DynamicLoad.h" + +namespace dynload { + +std::once_flag lapack_dso_flag; +void* lapack_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load lapack routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + int operator()(Args... args)->decltype(__name(args...)) { \ + using lapack_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \ + void* p_##__name = dlsym(lapack_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; // struct DynLoad__##__name + +// clang-format off +#ifdef PADDLE_USE_LAPACK +#ifdef PADDLE_USE_ATLAS + #define LAPACK_ROUTINE_EACH(__macro) \ + __macro(clapack_sgetrf) \ + __macro(clapack_dgetrf) \ + __macro(clapack_sgetri) \ + __macro(clapack_dgetri) +#else + #define LAPACK_ROUTINE_EACH(__macro) \ + __macro(LAPACKE_sgetrf) \ + __macro(LAPACKE_dgetrf) \ + __macro(LAPACKE_sgetri) \ + __macro(LAPACKE_dgetri) +#endif +#endif +// clang-format on +} // namespace dynload namespace paddle { @@ -87,9 +130,9 @@ int getrf(const CBLAS_ORDER order, int* ipiv) { #ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS - return clapack_sgetrf(order, M, N, A, lda, ipiv); + return dynload::clapack_sgetrf(order, M, N, A, lda, ipiv); #else - return LAPACKE_sgetrf(order, M, N, A, lda, ipiv); + return dynload::LAPACKE_sgetrf(order, M, N, A, lda, ipiv); #endif #else LOG(FATAL) << "Not implemented"; @@ -106,9 +149,9 @@ int getrf(const CBLAS_ORDER order, int* ipiv) { #ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS - return clapack_dgetrf(order, M, N, A, lda, ipiv); + return dynload::clapack_dgetrf(order, M, N, A, lda, ipiv); #else - return LAPACKE_dgetrf(order, M, N, A, lda, ipiv); + return dynload::LAPACKE_dgetrf(order, M, N, A, lda, ipiv); #endif #else LOG(FATAL) << "Not implemented"; @@ -124,9 +167,9 @@ int getri(const CBLAS_ORDER order, const int* ipiv) { #ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS - return clapack_sgetri(order, N, A, lda, ipiv); + return dynload::clapack_sgetri(order, N, A, lda, ipiv); #else - return LAPACKE_sgetri(order, N, A, lda, ipiv); + return dynload::LAPACKE_sgetri(order, N, A, lda, ipiv); #endif #else LOG(FATAL) << "Not implemented"; @@ -142,9 +185,9 @@ int getri(const CBLAS_ORDER order, const int* ipiv) { #ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS - return clapack_dgetri(order, N, A, lda, ipiv); + return dynload::clapack_dgetri(order, N, A, lda, ipiv); #else - return LAPACKE_dgetri(order, N, A, lda, ipiv); + return dynload::LAPACKE_dgetri(order, N, A, lda, ipiv); #endif #else LOG(FATAL) << "Not implemented"; diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/utils/DynamicLoad.cc similarity index 94% rename from paddle/cuda/src/hl_dso_loader.cc rename to paddle/utils/DynamicLoad.cc index 53164dd27c7..8f0532942e7 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/utils/DynamicLoad.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_dso_loader.h" +#include "DynamicLoad.h" +#include "Logging.h" #include -#include "paddle/utils/Logging.h" DEFINE_string(cudnn_dir, "", @@ -30,6 +30,8 @@ DEFINE_string(cuda_dir, DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); +DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -160,3 +162,11 @@ void GetWarpCTCDsoHandle(void** dso_handle) { GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); #endif } + +void GetLapackDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "liblapack.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "liblapack.so", dso_handle); +#endif +} diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/utils/DynamicLoad.h similarity index 84% rename from paddle/cuda/include/hl_dso_loader.h rename to paddle/utils/DynamicLoad.h index 276a07d3c73..5587993f874 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/utils/DynamicLoad.h @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef HL_DSO_LOADER_H_ -#define HL_DSO_LOADER_H_ +#ifndef HL_DYNAMIC_LOAD_H_ +#define HL_DYNAMIC_LOAD_H_ #include #include #include -#include "hl_base.h" /** * @brief load the DSO of CUBLAS @@ -52,4 +51,12 @@ void GetCurandDsoHandle(void** dso_handle); */ void GetWarpCTCDsoHandle(void** dso_handle); -#endif // HL_DSO_LOADER_H_ +/** + * @brief load the DSO of lapack + * + * @param **dso_handle dso handler + * + */ +void GetLapackDsoHandle(void** dso_handle); + +#endif // HL_DYNAMIC_LOAD_H_ -- GitLab