dlopen lapacke api and remove gfotran

5b8fe87f · liaogang · 896b9c55 · 5b8fe87f · 5b8fe87f · 5b8fe87f
9 changed file
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -27,33 +27,6 @@ IF(NOT ${CBLAS_FOUND})
        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
    ENDIF(WIN32)

-    IF(CMAKE_COMPILER_IS_GNUCC)
-        ENABLE_LANGUAGE(Fortran)
-        if (NOT CMAKE_Fortran_COMPILER_VERSION)
-          # cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly.
-          execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion
-                    OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION)
-        endif()
-        string(REGEX MATCHALL "[0-9]+" Fortran_VERSION ${CMAKE_Fortran_COMPILER_VERSION})
-        list(GET Fortran_VERSION 0 Fortran_MAJOR)
-        list(GET Fortran_VERSION 1 Fortran_MINOR)
-        find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS 
-                     /lib
-                     /usr/lib
-                     /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/
-                     /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/)
-        if (NOT GFORTRAN_LIBRARY)
-            message(FATAL_ERROR "Cannot found gfortran library which it is used by openblas")
-        endif()
-        find_package(Threads REQUIRED)
-        LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
-    ENDIF(CMAKE_COMPILER_IS_GNUCC)
-
-    IF(NOT CMAKE_Fortran_COMPILER)
-        MESSAGE(FATAL_ERROR "To build lapack in libopenblas, "
-                "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
-    ENDIF(NOT CMAKE_Fortran_COMPILER)
-
    ADD_DEFINITIONS(-DPADDLE_USE_LAPACK)

    ExternalProject_Add(
@@ -64,7 +37,7 @@ IF(NOT ${CBLAS_FOUND})
        PREFIX              ${CBLAS_SOURCES_DIR}
        INSTALL_DIR         ${CBLAS_INSTALL_DIR}
        BUILD_IN_SOURCE     1
-        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_LAPACK=1 DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib
        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX=<INSTALL_DIR>
        UPDATE_COMMAND      ""
        CONFIGURE_COMMAND   ""

--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -21,16 +21,13 @@ set(CUDA_CXX_WITH_GPU_SOURCES

 if(WITH_GPU)
    set(CUDA_CXX_SOURCES
-        src/hl_dso_loader.cc
        src/hl_warpctc_wrap.cc
        ${CUDA_CXX_WITH_GPU_SOURCES})

    set_source_files_properties(${CUDA_CXX_SOURCES}
                                PROPERTIES COMPILE_FLAGS "-D__NVCC__")
 else()
-    set(CUDA_CXX_SOURCES
-        src/hl_dso_loader.cc
-        src/hl_warpctc_wrap.cc)
+    set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
 endif()

 set(CUDA_CU_SOURCES
@@ -47,7 +44,6 @@ set(CUDA_CU_SOURCES

 set(CUDA_HEADERS
    include/hl_time.h
-    include/hl_dso_loader.h
    include/hl_warpctc_wrap.h
    include/hl_sequence.h
    include/hl_cuda_cublas.h

--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <sys/time.h>
 #include <mutex>
 #include "hl_cuda.h"
-#include "hl_dso_loader.h"
 #include "hl_thread.ph"
+#include "paddle/utils/DynamicLoad.h"
 #include "paddle/utils/Logging.h"

 namespace dynload {

--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -17,8 +17,8 @@ limitations under the License. */
 #include <gflags/gflags.h>
 #include <mutex>
 #include "hl_cuda_cudnn.ph"
-#include "hl_dso_loader.h"
 #include "hl_thread.ph"
+#include "paddle/utils/DynamicLoad.h"
 #include "paddle/utils/Logging.h"

 DEFINE_int32(cudnn_conv_workspace_limit_in_mb,

--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -24,8 +24,8 @@ limitations under the License. */
 #include <mutex>
 #include "hl_cuda.ph"
 #include "hl_thread.ph"
-#include "hl_dso_loader.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/DynamicLoad.h"
 // clang-format on

 namespace dynload {
@@ -98,11 +98,11 @@ int g_cuda_lib_version = 0;
 * Check build-in cuda function using glog and it **does not**
 * support << operator for more details error info.
 */
-#define CHECK_CUDA(cudaFunc)                                         \
-  do {                                                               \
-    cudaError_t cudaStat = cudaFunc;                                 \
-    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                \
-                                    << cudaGetErrorString(cudaStat); \
+#define CHECK_CUDA(cudaFunc)                               \
+  do {                                                     \
+    cudaError_t cudaStat = cudaFunc;                       \
+    CHECK_EQ(cudaSuccess, cudaStat)                        \
+        << "Cuda Error: " << cudaGetErrorString(cudaStat); \
  } while (0)

 /**
@@ -469,8 +469,8 @@ void hl_specify_devices_start(int *device, int number) {
  CHECK(tmp) << "[Start failed] System memory is not enough.";

  g_device = (hl_device_prop *)tmp;
-  device_prop = (hl_device_prop)(
-      (char *)tmp + g_system_device_num * sizeof(hl_device_prop *));
+  device_prop = (hl_device_prop)((char *)tmp + g_system_device_num *
+                                                   sizeof(hl_device_prop *));
  memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *));
  int num = 0;
  for (int i = 0; i < number; i++) {
@@ -559,8 +559,8 @@ bool hl_get_sync_flag() { return g_sync_flag; }
 void hl_stream_synchronize(hl_stream_t stream) {
  cudaStream_t cu_stream;

-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
+  CHECK_LT(stream, HPPL_STREAM_END)
+      << __func__ << ": the parameter stream is error.";

  cu_stream = t_resource.stream[stream];
  CHECK_CUDA(cudaStreamSynchronize(cu_stream));
@@ -590,8 +590,8 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
  cudaStream_t cu_stream;

  CHECK_NOTNULL(event);
-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
+  CHECK_LT(stream, HPPL_STREAM_END)
+      << __func__ << ": the parameter stream is error.";

  cu_stream = t_resource.stream[stream];
  CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
@@ -601,8 +601,8 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
  cudaStream_t cu_stream;

  CHECK_NOTNULL(event);
-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
+  CHECK_LT(stream, HPPL_STREAM_END)
+      << __func__ << ": the parameter stream is error.";

  cu_stream = t_resource.stream[stream];
  CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));

--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
@@ -14,7 +14,7 @@ limitations under the License. */

 #include "hl_warpctc_wrap.h"
 #include <mutex>
-#include "hl_dso_loader.h"
+#include "paddle/utils/DynamicLoad.h"
 #include "paddle/utils/Logging.h"

 namespace dynload {

--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -15,6 +15,49 @@ limitations under the License. */
 #include "MathFunctions.h"
 #include "hl_matrix_apply.cuh"
 #include "hl_matrix_ops.cuh"
+#include "paddle/utils/DynamicLoad.h"
+
+namespace dynload {
+
+std::once_flag lapack_dso_flag;
+void* lapack_dso_handle = nullptr;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load lapack routine
+ * via operator overloading.
+ *
+ * note: default dynamic linked libs
+ */
+#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    int operator()(Args... args)->decltype(__name(args...)) {                  \
+      using lapack_func = decltype(__name(args...)) (*)(Args...);              \
+      std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \
+      void* p_##__name = dlsym(lapack_dso_handle, #__name);                    \
+      return reinterpret_cast<lapack_func>(p_##__name)(args...);               \
+    }                                                                          \
+  } __name;  // struct DynLoad__##__name
+
+// clang-format off
+#ifdef PADDLE_USE_LAPACK
+#ifdef PADDLE_USE_ATLAS
+  #define LAPACK_ROUTINE_EACH(__macro)        \
+    __macro(clapack_sgetrf)                   \
+    __macro(clapack_dgetrf)                   \
+    __macro(clapack_sgetri)                   \
+    __macro(clapack_dgetri)          
+#else
+  #define LAPACK_ROUTINE_EACH(__macro)        \
+    __macro(LAPACKE_sgetrf)                   \
+    __macro(LAPACKE_dgetrf)                   \
+    __macro(LAPACKE_sgetri)                   \
+    __macro(LAPACKE_dgetri)          
+#endif
+#endif
+// clang-format on
+}  // namespace dynload

 namespace paddle {

@@ -87,9 +130,9 @@ int getrf<float>(const CBLAS_ORDER order,
                 int* ipiv) {
 #ifdef PADDLE_USE_LAPACK
 #ifdef PADDLE_USE_ATLAS
-  return clapack_sgetrf(order, M, N, A, lda, ipiv);
+  return dynload::clapack_sgetrf(order, M, N, A, lda, ipiv);
 #else
-  return LAPACKE_sgetrf(order, M, N, A, lda, ipiv);
+  return dynload::LAPACKE_sgetrf(order, M, N, A, lda, ipiv);
 #endif
 #else
  LOG(FATAL) << "Not implemented";
@@ -106,9 +149,9 @@ int getrf<double>(const CBLAS_ORDER order,
                  int* ipiv) {
 #ifdef PADDLE_USE_LAPACK
 #ifdef PADDLE_USE_ATLAS
-  return clapack_dgetrf(order, M, N, A, lda, ipiv);
+  return dynload::clapack_dgetrf(order, M, N, A, lda, ipiv);
 #else
-  return LAPACKE_dgetrf(order, M, N, A, lda, ipiv);
+  return dynload::LAPACKE_dgetrf(order, M, N, A, lda, ipiv);
 #endif
 #else
  LOG(FATAL) << "Not implemented";
@@ -124,9 +167,9 @@ int getri<float>(const CBLAS_ORDER order,
                 const int* ipiv) {
 #ifdef PADDLE_USE_LAPACK
 #ifdef PADDLE_USE_ATLAS
-  return clapack_sgetri(order, N, A, lda, ipiv);
+  return dynload::clapack_sgetri(order, N, A, lda, ipiv);
 #else
-  return LAPACKE_sgetri(order, N, A, lda, ipiv);
+  return dynload::LAPACKE_sgetri(order, N, A, lda, ipiv);
 #endif
 #else
  LOG(FATAL) << "Not implemented";
@@ -142,9 +185,9 @@ int getri<double>(const CBLAS_ORDER order,
                  const int* ipiv) {
 #ifdef PADDLE_USE_LAPACK
 #ifdef PADDLE_USE_ATLAS
-  return clapack_dgetri(order, N, A, lda, ipiv);
+  return dynload::clapack_dgetri(order, N, A, lda, ipiv);
 #else
-  return LAPACKE_dgetri(order, N, A, lda, ipiv);
+  return dynload::LAPACKE_dgetri(order, N, A, lda, ipiv);
 #endif
 #else
  LOG(FATAL) << "Not implemented";

--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_dso_loader.h"
+#include "DynamicLoad.h"
+#include "Logging.h"
 #include <gflags/gflags.h>
-#include "paddle/utils/Logging.h"

 DEFINE_string(cudnn_dir,
              "",
@@ -30,6 +30,8 @@ DEFINE_string(cuda_dir,

 DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");

+DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
+
 static inline std::string join(const std::string& part1,
                               const std::string& part2) {
  // directory separator
@@ -160,3 +162,11 @@ void GetWarpCTCDsoHandle(void** dso_handle) {
  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
 #endif
 }
+
+void GetLapackDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "liblapack.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "liblapack.so", dso_handle);
+#endif
+}
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifndef HL_DSO_LOADER_H_
-#define HL_DSO_LOADER_H_
+#ifndef HL_DYNAMIC_LOAD_H_
+#define HL_DYNAMIC_LOAD_H_

 #include <dlfcn.h>
 #include <memory>
 #include <string>
-#include "hl_base.h"

 /**
 * @brief    load the DSO of CUBLAS
@@ -52,4 +51,12 @@ void GetCurandDsoHandle(void** dso_handle);
 */
 void GetWarpCTCDsoHandle(void** dso_handle);

-#endif  // HL_DSO_LOADER_H_
+/**
+ * @brief    load the DSO of lapack
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetLapackDsoHandle(void** dso_handle);
+
+#endif  // HL_DYNAMIC_LOAD_H_