From f503f129250cae7e2d0531e93c3d948bc1f906ef Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 20 Jun 2018 11:29:22 +0800 Subject: [PATCH] enable dynamic load mklml lib on fluid --- cmake/external/openblas.cmake | 7 +- .../tests/book/test_inference_nlp.cc | 4 +- paddle/fluid/operators/math/blas.h | 7 +- paddle/fluid/operators/math/blas_impl.h | 91 ++++++++++++++----- paddle/fluid/operators/math/math_function.h | 4 +- paddle/fluid/platform/dynload/CMakeLists.txt | 4 + .../fluid/platform/dynload/dynamic_loader.cc | 10 ++ .../fluid/platform/dynload/dynamic_loader.h | 1 + paddle/fluid/platform/dynload/mklml.cc | 30 ++++++ paddle/fluid/platform/dynload/mklml.h | 71 +++++++++++++++ 10 files changed, 193 insertions(+), 36 deletions(-) create mode 100644 paddle/fluid/platform/dynload/mklml.cc create mode 100644 paddle/fluid/platform/dynload/mklml.h diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 4a49a92f2b1..ce6a88b51dc 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -114,7 +114,12 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";") ADD_LIBRARY(cblas STATIC ${dummyfile}) -TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) + +IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") + TARGET_LINK_LIBRARIES(cblas dynload_mklml) +ELSE() + TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) +ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML") IF(NOT ${CBLAS_FOUND}) ADD_DEPENDENCIES(cblas extern_openblas) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index cbba8b9d559..03b0b694633 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" +#include "paddle/fluid/operators/math/blas.h" #ifdef PADDLE_WITH_MKLML -#include #include #endif @@ -164,7 +164,7 @@ TEST(inference, nlp) { // only use 1 thread number per std::thread omp_set_dynamic(0); omp_set_num_threads(1); - mkl_set_num_threads(1); + paddle::operators::math::SetNumThreads(1); #endif double start_ms = 0, stop_ms = 0; diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 6207d14ecdc..a907d6a71b7 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -18,10 +18,7 @@ #include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_MKLML -#include -#include -#include -#include +#include "paddle/fluid/platform/dynload/mklml.h" #endif #ifdef PADDLE_USE_OPENBLAS @@ -55,7 +52,7 @@ static void SetNumThreads(int num_threads) { openblas_set_num_threads(real_num_threads); #elif defined(PADDLE_WITH_MKLML) int real_num_threads = num_threads > 1 ? num_threads : 1; - mkl_set_num_threads(real_num_threads); + platform::dynload::MKL_Set_Num_Threads(real_num_threads); #else PADDLE_ENFORCE(false, "To be implemented."); #endif diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index ae20406bc21..2ce94cfc938 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -22,61 +22,109 @@ namespace math { template struct CBlas; +#ifdef PADDLE_WITH_MKLML template <> struct CBlas { template static void GEMM(ARGS... args) { - cblas_sgemm(args...); + platform::dynload::cblas_sgemm(args...); } template static void AXPY(ARGS... args) { - cblas_saxpy(args...); + platform::dynload::cblas_saxpy(args...); + } + + template + static void VCOPY(ARGS... args) { + platform::dynload::cblas_scopy(args...); + } + + template + static void GEMV(ARGS... args) { + platform::dynload::cblas_sgemv(args...); + } + + template + static void GEMM_BATCH(ARGS... args) { + platform::dynload::cblas_sgemm_batch(args...); } -#ifdef PADDLE_WITH_MKLML template static void VADD(ARGS... args) { - vsAdd(args...); + platform::dynload::vsAdd(args...); + } +}; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + platform::dynload::cblas_dgemm(args...); + } + + template + static void AXPY(ARGS... args) { + platform::dynload::cblas_daxpy(args...); } -#endif template static void VCOPY(ARGS... args) { - cblas_scopy(args...); + platform::dynload::cblas_dcopy(args...); } template static void GEMV(ARGS... args) { - cblas_sgemv(args...); + platform::dynload::cblas_dgemv(args...); } -#ifdef PADDLE_WITH_MKLML template static void GEMM_BATCH(ARGS... args) { - cblas_sgemm_batch(args...); + platform::dynload::cblas_dgemm_batch(args...); + } + + template + static void VADD(ARGS... args) { + platform::dynload::vdAdd(args...); } -#endif }; +#else + template <> -struct CBlas { +struct CBlas { template static void GEMM(ARGS... args) { - cblas_dgemm(args...); + cblas_sgemm(args...); } template static void AXPY(ARGS... args) { - cblas_daxpy(args...); + cblas_saxpy(args...); } -#ifdef PADDLE_WITH_MKLML template - static void VADD(ARGS... args) { - vdAdd(args...); + static void VCOPY(ARGS... args) { + cblas_scopy(args...); + } + + template + static void GEMV(ARGS... args) { + cblas_sgemv(args...); + } +}; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + cblas_dgemm(args...); + } + + template + static void AXPY(ARGS... args) { + cblas_daxpy(args...); } -#endif template static void VCOPY(ARGS... args) { @@ -87,15 +135,8 @@ struct CBlas { static void GEMV(ARGS... args) { cblas_dgemv(args...); } - -#ifdef PADDLE_WITH_MKLML - template - static void GEMM_BATCH(ARGS... args) { - cblas_dgemm_batch(args...); - } -#endif }; - +#endif template <> struct CBlas { static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index 8b296b6a07c..56a039d3cec 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -14,9 +14,7 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_MKLML -#include -#include -#include +#include "paddle/fluid/platform/dynload/mklml.h" #endif #ifdef PADDLE_USE_OPENBLAS diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 364c4901b29..68fa5765434 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -12,3 +12,7 @@ if (CUPTI_FOUND) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +if (WITH_MKLML) + cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) +endif() +# TODO(TJ): add iomp, mkldnn? diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 19c01dc5a96..34fbccddc2b 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -49,6 +49,8 @@ DEFINE_string( tensorrt_dir, "", "Specify path for loading tensorrt library, such as libnvinfer.so."); +DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); + namespace paddle { namespace platform { namespace dynload { @@ -206,6 +208,14 @@ void* GetTensorRtDsoHandle() { #endif } +void* GetMKLMLDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); +#endif +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 0de3559b608..ca87dc47f35 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -26,6 +26,7 @@ void* GetWarpCTCDsoHandle(); void* GetLapackDsoHandle(); void* GetNCCLDsoHandle(); void* GetTensorRtDsoHandle(); +void* GetMKLMLDsoHandle(); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/mklml.cc b/paddle/fluid/platform/dynload/mklml.cc new file mode 100644 index 00000000000..0f61a5e09b3 --- /dev/null +++ b/paddle/fluid/platform/dynload/mklml.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mklml.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag mklml_dso_flag; +void* mklml_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MKLML_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h new file mode 100644 index 00000000000..17acefe8cde --- /dev/null +++ b/paddle/fluid/platform/dynload/mklml.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag mklml_dso_flag; +extern void* mklml_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mklml routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_MKLML_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using mklmlFunc = decltype(&::__name); \ + std::call_once(mklml_dso_flag, []() { \ + mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \ + }); \ + static void* p_##_name = dlsym(mklml_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name) + +#define MKLML_ROUTINE_EACH(__macro) \ + __macro(cblas_sgemm); \ + __macro(cblas_saxpy); \ + __macro(cblas_scopy); \ + __macro(cblas_sgemv); \ + __macro(cblas_sgemm_batch); \ + __macro(cblas_dgemm); \ + __macro(cblas_daxpy); \ + __macro(cblas_dcopy); \ + __macro(cblas_dgemv); \ + __macro(cblas_dgemm_batch); \ + __macro(vsAdd); \ + __macro(vdAdd); \ + __macro(MKL_Set_Num_Threads) + +MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); + +#undef DYNAMIC_LOAD_MKLML_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle -- GitLab