Repository: Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)

Commit f503f129
Authored June 20, 2018 by tensor-tang
Parent: 25241e9e

enable dynamic load mklml lib on fluid
Showing 10 changed files with 193 additions and 36 deletions (+193, -36)
cmake/external/openblas.cmake  (+6, -1)
paddle/fluid/inference/tests/book/test_inference_nlp.cc  (+2, -2)
paddle/fluid/operators/math/blas.h  (+2, -5)
paddle/fluid/operators/math/blas_impl.h  (+66, -25)
paddle/fluid/operators/math/math_function.h  (+1, -3)
paddle/fluid/platform/dynload/CMakeLists.txt  (+4, -0)
paddle/fluid/platform/dynload/dynamic_loader.cc  (+10, -0)
paddle/fluid/platform/dynload/dynamic_loader.h  (+1, -0)
paddle/fluid/platform/dynload/mklml.cc  (+30, -0)
paddle/fluid/platform/dynload/mklml.h  (+71, -0)
cmake/external/openblas.cmake

@@ -114,7 +114,12 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
 ADD_LIBRARY(cblas STATIC ${dummyfile})
-TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
+IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
+  TARGET_LINK_LIBRARIES(cblas dynload_mklml)
+ELSE()
+  TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
+ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
 IF(NOT ${CBLAS_FOUND})
   ADD_DEPENDENCIES(cblas extern_openblas)
paddle/fluid/inference/tests/book/test_inference_nlp.cc

@@ -19,8 +19,8 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
+#include "paddle/fluid/operators/math/blas.h"
 #ifdef PADDLE_WITH_MKLML
-#include <mkl_service.h>
 #include <omp.h>
 #endif

@@ -164,7 +164,7 @@ TEST(inference, nlp) {
   // only use 1 thread number per std::thread
   omp_set_dynamic(0);
   omp_set_num_threads(1);
-  mkl_set_num_threads(1);
+  paddle::operators::math::SetNumThreads(1);
 #endif
   double start_ms = 0, stop_ms = 0;
paddle/fluid/operators/math/blas.h

@@ -18,10 +18,7 @@
 #include "paddle/fluid/framework/tensor.h"
 #ifdef PADDLE_WITH_MKLML
-#include <mkl_cblas.h>
-#include <mkl_lapacke.h>
-#include <mkl_service.h>
-#include <mkl_vml_functions.h>
+#include "paddle/fluid/platform/dynload/mklml.h"
 #endif
 #ifdef PADDLE_USE_OPENBLAS

@@ -55,7 +52,7 @@ static void SetNumThreads(int num_threads) {
   openblas_set_num_threads(real_num_threads);
 #elif defined(PADDLE_WITH_MKLML)
   int real_num_threads = num_threads > 1 ? num_threads : 1;
-  mkl_set_num_threads(real_num_threads);
+  platform::dynload::MKL_Set_Num_Threads(real_num_threads);
 #else
   PADDLE_ENFORCE(false, "To be implemented.");
 #endif
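After this change, callers configure the BLAS thread count through the backend-neutral helper instead of calling mkl_set_num_threads() directly, as the test_inference_nlp.cc hunk above already does. A minimal usage sketch, assuming only the include path and namespace shown in this diff; ConfigureCpuMathThreads is an illustrative name, not part of the commit:

    // Sketch: pin the CPU BLAS thread count without referencing MKL headers.
    // SetNumThreads() resolves to openblas_set_num_threads() or to the
    // dynamically loaded platform::dynload::MKL_Set_Num_Threads(), depending
    // on the build flags, as shown in the blas.h hunk above.
    #include "paddle/fluid/operators/math/blas.h"

    void ConfigureCpuMathThreads(int num_threads) {  // illustrative helper
      paddle::operators::math::SetNumThreads(num_threads);
    }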
paddle/fluid/operators/math/blas_impl.h

@@ -22,61 +22,109 @@ namespace math {
 template <typename T>
 struct CBlas;
 
-template <>
-struct CBlas<float> {
-  template <typename... ARGS>
-  static void GEMM(ARGS... args) {
-    cblas_sgemm(args...);
-  }
-
-  template <typename... ARGS>
-  static void AXPY(ARGS... args) {
-    cblas_saxpy(args...);
-  }
-
-#ifdef PADDLE_WITH_MKLML
-  template <typename... ARGS>
-  static void VADD(ARGS... args) {
-    vsAdd(args...);
-  }
-#endif
-
-  template <typename... ARGS>
-  static void VCOPY(ARGS... args) {
-    cblas_scopy(args...);
-  }
-
-  template <typename... ARGS>
-  static void GEMV(ARGS... args) {
-    cblas_sgemv(args...);
-  }
-
-#ifdef PADDLE_WITH_MKLML
-  template <typename... ARGS>
-  static void GEMM_BATCH(ARGS... args) {
-    cblas_sgemm_batch(args...);
-  }
-#endif
-};
-
-template <>
-struct CBlas<double> {
-  template <typename... ARGS>
-  static void GEMM(ARGS... args) {
-    cblas_dgemm(args...);
-  }
-
-  template <typename... ARGS>
-  static void AXPY(ARGS... args) {
-    cblas_daxpy(args...);
-  }
-
-#ifdef PADDLE_WITH_MKLML
-  template <typename... ARGS>
-  static void VADD(ARGS... args) {
-    vdAdd(args...);
-  }
-#endif
-
+#ifdef PADDLE_WITH_MKLML
+template <>
+struct CBlas<float> {
+  template <typename... ARGS>
+  static void GEMM(ARGS... args) {
+    platform::dynload::cblas_sgemm(args...);
+  }
+
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    platform::dynload::cblas_saxpy(args...);
+  }
+
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    platform::dynload::cblas_scopy(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMV(ARGS... args) {
+    platform::dynload::cblas_sgemv(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMM_BATCH(ARGS... args) {
+    platform::dynload::cblas_sgemm_batch(args...);
+  }
+
+  template <typename... ARGS>
+  static void VADD(ARGS... args) {
+    platform::dynload::vsAdd(args...);
+  }
+};
+
+template <>
+struct CBlas<double> {
+  template <typename... ARGS>
+  static void GEMM(ARGS... args) {
+    platform::dynload::cblas_dgemm(args...);
+  }
+
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    platform::dynload::cblas_daxpy(args...);
+  }
+
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    platform::dynload::cblas_dcopy(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMV(ARGS... args) {
+    platform::dynload::cblas_dgemv(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMM_BATCH(ARGS... args) {
+    platform::dynload::cblas_dgemm_batch(args...);
+  }
+
+  template <typename... ARGS>
+  static void VADD(ARGS... args) {
+    platform::dynload::vdAdd(args...);
+  }
+};
+
+#else
+
+template <>
+struct CBlas<float> {
+  template <typename... ARGS>
+  static void GEMM(ARGS... args) {
+    cblas_sgemm(args...);
+  }
+
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    cblas_saxpy(args...);
+  }
+
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    cblas_scopy(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMV(ARGS... args) {
+    cblas_sgemv(args...);
+  }
+};
+
+template <>
+struct CBlas<double> {
+  template <typename... ARGS>
+  static void GEMM(ARGS... args) {
+    cblas_dgemm(args...);
+  }
+
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    cblas_daxpy(args...);
+  }
+
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {

@@ -87,15 +135,8 @@ struct CBlas<double> {
   static void GEMV(ARGS... args) {
     cblas_dgemv(args...);
   }
-
-#ifdef PADDLE_WITH_MKLML
-  template <typename... ARGS>
-  static void GEMM_BATCH(ARGS... args) {
-    cblas_dgemm_batch(args...);
-  }
-#endif
 };
+#endif
 
 template <>
 struct CBlas<platform::float16> {
   static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
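Each CBlas<T> method above is a variadic forwarding shim: it passes its arguments through unchanged to the concrete single- or double-precision routine, now reached via platform::dynload:: when MKLML is in use. Higher-level Blas code can therefore be written once per scalar type T. A minimal sketch of how such a shim is consumed, assuming the surrounding blas_impl.h context; MatMulRowMajor is an illustrative name, not part of the diff:

    // Sketch: generic code dispatches through CBlas<T>, which forwards to
    // cblas_sgemm / cblas_dgemm (directly or through the dynload wrappers).
    template <typename T>
    void MatMulRowMajor(int m, int n, int k, const T* a, const T* b, T* c) {
      CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k,
                     static_cast<T>(1), a, k, b, n, static_cast<T>(0), c, n);
    }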
paddle/fluid/operators/math/math_function.h

@@ -14,9 +14,7 @@ limitations under the License. */
 #pragma once
 #ifdef PADDLE_WITH_MKLML
-#include <mkl_cblas.h>
-#include <mkl_lapacke.h>
-#include <mkl_vml_functions.h>
+#include "paddle/fluid/platform/dynload/mklml.h"
 #endif
 #ifdef PADDLE_USE_OPENBLAS
paddle/fluid/platform/dynload/CMakeLists.txt

@@ -12,3 +12,7 @@ if (CUPTI_FOUND)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
+if (WITH_MKLML)
+  cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
+endif()
+# TODO(TJ): add iomp, mkldnn?
paddle/fluid/platform/dynload/dynamic_loader.cc

@@ -49,6 +49,8 @@ DEFINE_string(
     tensorrt_dir, "",
     "Specify path for loading tensorrt library, such as libnvinfer.so.");
 
+DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
+
 namespace paddle {
 namespace platform {
 namespace dynload {

@@ -206,6 +208,14 @@ void* GetTensorRtDsoHandle() {
 #endif
 }
 
+void* GetMKLMLDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
+#endif
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
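With this accessor in place, the MKLML shared library is located at runtime rather than linked at build time, and the new --mklml_dir gflag lets a binary point the search at a non-default install location. A rough sketch of the loader-level contract, under the assumption that GetDsoHandleFromSearchPath() ultimately dlopen()s the library as the other Get*DsoHandle() helpers in this file do; LookupMklmlSymbol is an illustrative name:

    // Sketch: obtain the libmklml handle once and resolve a symbol from it.
    // The wrappers in mklml.h (next file) do the same via std::call_once.
    #include <dlfcn.h>
    #include "paddle/fluid/platform/dynload/dynamic_loader.h"

    void* LookupMklmlSymbol(const char* name) {
      void* handle = paddle::platform::dynload::GetMKLMLDsoHandle();
      return handle ? dlsym(handle, name) : nullptr;
    }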
paddle/fluid/platform/dynload/dynamic_loader.h

@@ -26,6 +26,7 @@ void* GetWarpCTCDsoHandle();
 void* GetLapackDsoHandle();
 void* GetNCCLDsoHandle();
 void* GetTensorRtDsoHandle();
+void* GetMKLMLDsoHandle();
 
 }  // namespace dynload
 }  // namespace platform
paddle/fluid/platform/dynload/mklml.cc (new file, mode 100644)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/dynload/mklml.h"

namespace paddle {
namespace platform {
namespace dynload {

std::once_flag mklml_dso_flag;
void* mklml_dso_handle = nullptr;

#define DEFINE_WRAP(__name) DynLoad__##__name __name

MKLML_ROUTINE_EACH(DEFINE_WRAP);

}  // namespace dynload
}  // namespace platform
}  // namespace paddle
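DEFINE_WRAP simply instantiates, in this translation unit, the global wrapper object that mklml.h declares extern for each routine. For example, for one entry of MKLML_ROUTINE_EACH:

    // DEFINE_WRAP(cblas_sgemm) expands to:
    DynLoad__cblas_sgemm cblas_sgemm;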
paddle/fluid/platform/dynload/mklml.h (new file, mode 100644)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <dlfcn.h>
#include <mkl.h>
#include <mutex>  // NOLINT

#include "paddle/fluid/platform/dynload/dynamic_loader.h"

namespace paddle {
namespace platform {
namespace dynload {

extern std::once_flag mklml_dso_flag;
extern void* mklml_dso_handle;

/**
 * The following macro definition can generate structs
 * (for each function) to dynamic load mklml routine
 * via operator overloading.
 */
#define DYNAMIC_LOAD_MKLML_WRAP(__name)                                    \
  struct DynLoad__##__name {                                               \
    template <typename... Args>                                            \
    auto operator()(Args... args) -> decltype(__name(args...)) {           \
      using mklmlFunc = decltype(&::__name);                               \
      std::call_once(mklml_dso_flag, []() {                                \
        mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
      });                                                                  \
      static void* p_##_name = dlsym(mklml_dso_handle, #__name);           \
      return reinterpret_cast<mklmlFunc>(p_##_name)(args...);              \
    }                                                                      \
  };                                                                       \
  extern DynLoad__##__name __name

#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)

#define MKLML_ROUTINE_EACH(__macro) \
  __macro(cblas_sgemm);             \
  __macro(cblas_saxpy);             \
  __macro(cblas_scopy);             \
  __macro(cblas_sgemv);             \
  __macro(cblas_sgemm_batch);       \
  __macro(cblas_dgemm);             \
  __macro(cblas_daxpy);             \
  __macro(cblas_dcopy);             \
  __macro(cblas_dgemv);             \
  __macro(cblas_dgemm_batch);       \
  __macro(vsAdd);                   \
  __macro(vdAdd);                   \
  __macro(MKL_Set_Num_Threads)

MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);

#undef DYNAMIC_LOAD_MKLML_WRAP

}  // namespace dynload
}  // namespace platform
}  // namespace paddle
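For a single routine, DECLARE_DYNAMIC_LOAD_MKLML_WRAP expands to a small callable struct plus an extern global named after the MKL symbol, which is why call sites such as platform::dynload::cblas_sgemm(args...) read like ordinary function calls. A simplified expansion for cblas_sgemm (illustrative; the cached-pointer variable name is tidied up here):

    // Approximate expansion of DECLARE_DYNAMIC_LOAD_MKLML_WRAP(cblas_sgemm):
    struct DynLoad__cblas_sgemm {
      template <typename... Args>
      auto operator()(Args... args) -> decltype(cblas_sgemm(args...)) {
        using mklmlFunc = decltype(&::cblas_sgemm);
        // Load libmklml once per process.
        std::call_once(mklml_dso_flag, []() {
          mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle();
        });
        // Resolve and cache the symbol address on first use.
        static void* p = dlsym(mklml_dso_handle, "cblas_sgemm");
        return reinterpret_cast<mklmlFunc>(p)(args...);
      }
    };
    extern DynLoad__cblas_sgemm cblas_sgemm;  // instance defined in mklml.cc via DEFINE_WRAP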