From 3c1dc6f6d728541a61365ef09c208a49e29bc22c Mon Sep 17 00:00:00 2001
From: Wilber
Date: Mon, 24 Jan 2022 12:01:01 +0800
Subject: [PATCH] [PTEN] Move dynload from fluid to pten. (#39120)

* move dynload from fluid to pten.
* fix ci compile
* fix windows ci compile.
* update
* update
* fix compile error
---
 paddle/fluid/distributed/service/brpc_utils.h | 6 +-
 .../table/depends/large_scale_kv.h | 8 +-
 paddle/fluid/framework/device_worker.h | 2 +-
 paddle/fluid/framework/io/shell.cc | 1 +
 paddle/fluid/framework/io/shell.h | 2 +-
 paddle/fluid/framework/trainer.h | 2 +-
 .../inference/analysis/analyzer_tester.cc | 2 +-
 paddle/fluid/inference/analysis/helper.h | 2 +-
 .../analysis/passes/memory_optimize_pass.h | 2 +-
 paddle/fluid/inference/api/helper.h | 2 +-
 paddle/fluid/inference/tests/test_helper.h | 2 +-
 paddle/fluid/operators/activation_op.cc | 2 +-
 paddle/fluid/operators/save_combine_op.h | 2 +-
 .../fluid/platform/device/npu/dynload/hccl.h | 2 +-
 paddle/fluid/platform/dynload/CMakeLists.txt | 18 +-
 paddle/fluid/platform/dynload/cublas.cc | 2 -
 paddle/fluid/platform/dynload/cublas.h | 31 +-
 paddle/fluid/platform/dynload/cublasLt.cc | 2 -
 paddle/fluid/platform/dynload/cublasLt.h | 26 +-
 paddle/fluid/platform/dynload/cuda_driver.cc | 9 +-
 paddle/fluid/platform/dynload/cuda_driver.h | 27 +-
 paddle/fluid/platform/dynload/cudnn.cc | 18 +-
 paddle/fluid/platform/dynload/cudnn.h | 36 +-
 paddle/fluid/platform/dynload/cufft.cc | 18 +-
 paddle/fluid/platform/dynload/cufft.h | 25 +-
 paddle/fluid/platform/dynload/cupti.cc | 3 -
 paddle/fluid/platform/dynload/cupti.h | 20 +-
 paddle/fluid/platform/dynload/curand.cc | 3 -
 paddle/fluid/platform/dynload/curand.h | 23 +-
 paddle/fluid/platform/dynload/cusolver.cc | 3 -
 paddle/fluid/platform/dynload/cusolver.h | 28 +-
 paddle/fluid/platform/dynload/cusparse.cc | 3 -
 paddle/fluid/platform/dynload/cusparse.h | 28 +-
 .../fluid/platform/dynload/dynamic_loader.cc | 541 +---
 paddle/fluid/platform/dynload/hipfft.cc | 3 -
 paddle/fluid/platform/dynload/hipfft.h | 64 +-
 paddle/fluid/platform/dynload/hiprand.cc | 3 -
 paddle/fluid/platform/dynload/hiprand.h | 23 +-
 paddle/fluid/platform/dynload/hiprtc.cc | 10 +-
 paddle/fluid/platform/dynload/hiprtc.h | 25 +-
 paddle/fluid/platform/dynload/lapack.cc | 4 -
 paddle/fluid/platform/dynload/lapack.h | 110 +---
 paddle/fluid/platform/dynload/miopen.cc | 18 +-
 paddle/fluid/platform/dynload/miopen.h | 73 +--
 paddle/fluid/platform/dynload/mklml.cc | 3 -
 paddle/fluid/platform/dynload/mklml.h | 25 +-
 paddle/fluid/platform/dynload/mklrt.h | 16 +-
 paddle/fluid/platform/dynload/nccl.cc | 3 -
 paddle/fluid/platform/dynload/nccl.h | 30 +-
 paddle/fluid/platform/dynload/nvjpeg.cc | 3 -
 paddle/fluid/platform/dynload/nvjpeg.h | 23 +-
 paddle/fluid/platform/dynload/nvrtc.cc | 10 +-
 paddle/fluid/platform/dynload/nvrtc.h | 25 +-
 paddle/fluid/platform/dynload/nvtx.cc | 3 -
 paddle/fluid/platform/dynload/nvtx.h | 25 +-
 paddle/fluid/platform/dynload/rccl.cc | 3 -
 paddle/fluid/platform/dynload/rccl.h | 26 +-
 paddle/fluid/platform/dynload/rocblas.cc | 2 -
 paddle/fluid/platform/dynload/rocblas.h | 30 +-
 paddle/fluid/platform/dynload/rocm_driver.cc | 9 +-
 paddle/fluid/platform/dynload/rocm_driver.h | 25 +-
 paddle/fluid/platform/dynload/warpctc.cc | 3 -
 paddle/fluid/platform/dynload/warpctc.h | 21 +-
 paddle/fluid/platform/enforce.h | 32 +-
 paddle/fluid/platform/os_info.cc | 2 +
 paddle/fluid/platform/os_info.h | 2 +-
 paddle/fluid/platform/timer.h | 2 +-
 paddle/pten/backends/CMakeLists.txt | 3 +
 paddle/pten/backends/dynload/CMakeLists.txt | 57 ++
 paddle/pten/backends/dynload/cublas.cc | 38 ++
 paddle/pten/backends/dynload/cublas.h | 136 ++++
 paddle/pten/backends/dynload/cublasLt.cc | 27 +
 paddle/pten/backends/dynload/cublasLt.h | 75 +++
 paddle/pten/backends/dynload/cuda_driver.cc | 36 ++
 paddle/pten/backends/dynload/cuda_driver.h | 82 +++
 paddle/pten/backends/dynload/cudnn.cc | 63 ++
 paddle/pten/backends/dynload/cudnn.h | 199 ++++++
 paddle/pten/backends/dynload/cufft.cc | 42 ++
 paddle/pten/backends/dynload/cufft.h | 111 ++++
 paddle/pten/backends/dynload/cupti.cc | 32 +
 paddle/pten/backends/dynload/cupti.h | 74 +++
 .../pten/backends/dynload/cupti_lib_path.h.in | 17 +
 paddle/pten/backends/dynload/curand.cc | 28 +
 paddle/pten/backends/dynload/curand.h | 53 ++
 paddle/pten/backends/dynload/cusolver.cc | 36 ++
 paddle/pten/backends/dynload/cusolver.h | 123 ++++
 paddle/pten/backends/dynload/cusparse.cc | 37 ++
 paddle/pten/backends/dynload/cusparse.h | 92 +++
 .../pten/backends/dynload/dynamic_loader.cc | 585 ++++++++++++++++++
 paddle/pten/backends/dynload/dynamic_loader.h | 52 ++
 paddle/pten/backends/dynload/hipfft.cc | 28 +
 paddle/pten/backends/dynload/hipfft.h | 122 ++++
 paddle/pten/backends/dynload/hiprand.cc | 28 +
 paddle/pten/backends/dynload/hiprand.h | 54 ++
 paddle/pten/backends/dynload/hiprtc.cc | 34 +
 paddle/pten/backends/dynload/hiprtc.h | 62 ++
 paddle/pten/backends/dynload/lapack.cc | 29 +
 paddle/pten/backends/dynload/lapack.h | 340 ++++++++++
 paddle/pten/backends/dynload/miopen.cc | 67 ++
 paddle/pten/backends/dynload/miopen.h | 196 ++++++
 paddle/pten/backends/dynload/mklml.cc | 33 +
 paddle/pten/backends/dynload/mklml.h | 123 ++++
 paddle/pten/backends/dynload/mklrt.cc | 50 ++
 paddle/pten/backends/dynload/mklrt.h | 79 +++
 paddle/pten/backends/dynload/nccl.cc | 44 ++
 paddle/pten/backends/dynload/nccl.h | 86 +++
 paddle/pten/backends/dynload/nvjpeg.cc | 25 +
 paddle/pten/backends/dynload/nvjpeg.h | 51 ++
 paddle/pten/backends/dynload/nvrtc.cc | 34 +
 paddle/pten/backends/dynload/nvrtc.h | 63 ++
 paddle/pten/backends/dynload/nvtx.cc | 29 +
 paddle/pten/backends/dynload/nvtx.h | 51 ++
 .../platform => pten/backends/dynload}/port.h | 0
 paddle/pten/backends/dynload/rccl.cc | 36 ++
 paddle/pten/backends/dynload/rccl.h | 74 +++
 paddle/pten/backends/dynload/rocblas.cc | 38 ++
 paddle/pten/backends/dynload/rocblas.h | 112 ++++
 paddle/pten/backends/dynload/rocm_driver.cc | 33 +
 paddle/pten/backends/dynload/rocm_driver.h | 66 ++
 paddle/pten/backends/dynload/tensorrt.cc | 83 +++
 paddle/pten/backends/dynload/tensorrt.h | 118 ++++
 paddle/pten/backends/dynload/warpctc.cc | 28 +
 paddle/pten/backends/dynload/warpctc.h | 64 ++
 123 files changed, 4509 insertions(+), 1326 deletions(-)
 create mode 100644 paddle/pten/backends/dynload/CMakeLists.txt
 create mode 100644 paddle/pten/backends/dynload/cublas.cc
 create mode 100644 paddle/pten/backends/dynload/cublas.h
 create mode 100644 paddle/pten/backends/dynload/cublasLt.cc
 create mode 100644 paddle/pten/backends/dynload/cublasLt.h
 create mode 100644 paddle/pten/backends/dynload/cuda_driver.cc
 create mode 100644 paddle/pten/backends/dynload/cuda_driver.h
 create mode 100644 paddle/pten/backends/dynload/cudnn.cc
 create mode 100644 paddle/pten/backends/dynload/cudnn.h
 create mode 100644 paddle/pten/backends/dynload/cufft.cc
 create mode 100644 paddle/pten/backends/dynload/cufft.h
 create mode 100644 paddle/pten/backends/dynload/cupti.cc
 create mode 100644 paddle/pten/backends/dynload/cupti.h
 create mode 100644 paddle/pten/backends/dynload/cupti_lib_path.h.in
 create mode 100644 paddle/pten/backends/dynload/curand.cc
 create mode 100644 paddle/pten/backends/dynload/curand.h
 create mode 100644 paddle/pten/backends/dynload/cusolver.cc
 create mode 100644 paddle/pten/backends/dynload/cusolver.h
 create mode 100644 paddle/pten/backends/dynload/cusparse.cc
 create mode 100644 paddle/pten/backends/dynload/cusparse.h
 create mode 100644 paddle/pten/backends/dynload/dynamic_loader.cc
 create mode 100644 paddle/pten/backends/dynload/dynamic_loader.h
 create mode 100644 paddle/pten/backends/dynload/hipfft.cc
 create mode 100644 paddle/pten/backends/dynload/hipfft.h
 create mode 100644 paddle/pten/backends/dynload/hiprand.cc
 create mode 100644 paddle/pten/backends/dynload/hiprand.h
 create mode 100644 paddle/pten/backends/dynload/hiprtc.cc
 create mode 100644 paddle/pten/backends/dynload/hiprtc.h
 create mode 100644 paddle/pten/backends/dynload/lapack.cc
 create mode 100644 paddle/pten/backends/dynload/lapack.h
 create mode 100644 paddle/pten/backends/dynload/miopen.cc
 create mode 100644 paddle/pten/backends/dynload/miopen.h
 create mode 100644 paddle/pten/backends/dynload/mklml.cc
 create mode 100644 paddle/pten/backends/dynload/mklml.h
 create mode 100644 paddle/pten/backends/dynload/mklrt.cc
 create mode 100644 paddle/pten/backends/dynload/mklrt.h
 create mode 100644 paddle/pten/backends/dynload/nccl.cc
 create mode 100644 paddle/pten/backends/dynload/nccl.h
 create mode 100644 paddle/pten/backends/dynload/nvjpeg.cc
 create mode 100644 paddle/pten/backends/dynload/nvjpeg.h
 create mode 100644 paddle/pten/backends/dynload/nvrtc.cc
 create mode 100644 paddle/pten/backends/dynload/nvrtc.h
 create mode 100644 paddle/pten/backends/dynload/nvtx.cc
 create mode 100644 paddle/pten/backends/dynload/nvtx.h
 rename paddle/{fluid/platform => pten/backends/dynload}/port.h (100%)
 create mode 100644 paddle/pten/backends/dynload/rccl.cc
 create mode 100644 paddle/pten/backends/dynload/rccl.h
 create mode 100644 paddle/pten/backends/dynload/rocblas.cc
 create mode 100644 paddle/pten/backends/dynload/rocblas.h
 create mode 100644 paddle/pten/backends/dynload/rocm_driver.cc
 create mode 100644 paddle/pten/backends/dynload/rocm_driver.h
 create mode 100644 paddle/pten/backends/dynload/tensorrt.cc
 create mode 100644 paddle/pten/backends/dynload/tensorrt.h
 create mode 100644 paddle/pten/backends/dynload/warpctc.cc
 create mode 100644 paddle/pten/backends/dynload/warpctc.h

diff --git a/paddle/fluid/distributed/service/brpc_utils.h b/paddle/fluid/distributed/service/brpc_utils.h
index 556bbb1048e..ebae710acc2 100644
--- a/paddle/fluid/distributed/service/brpc_utils.h
+++ b/paddle/fluid/distributed/service/brpc_utils.h
@@ -27,7 +27,7 @@ limitations under the License.
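Every file-level change below applies the same refactoring: the real dlopen/dlsym wrappers move into the new paddle/pten/backends/dynload library, includes of paddle/fluid/platform/port.h are re-pointed at paddle/pten/backends/dynload/port.h, and each fluid-side DynLoad__<api> struct becomes an alias for its pten counterpart so existing call sites keep compiling. A minimal sketch of the resulting layering, written for this description only (fooCreate is a made-up name standing in for any wrapped routine):

namespace pten {
namespace dynload {
// Sketch: the pten side now owns the lazy-loading functor; the real one
// dlopens the vendor library on first use (see the cublas/cudnn hunks below).
struct DynLoad__fooCreate {
  int operator()(int* out);
};
extern DynLoad__fooCreate fooCreate;
}  // namespace dynload
}  // namespace pten

namespace paddle {
namespace platform {
namespace dynload {
// The fluid header keeps the old spelling as a thin alias plus an extern
// object, so platform::dynload::fooCreate(&x) still resolves as before.
using DynLoad__fooCreate = pten::dynload::DynLoad__fooCreate;
extern DynLoad__fooCreate fooCreate;
}  // namespace dynload
}  // namespace platform
}  // namespace paddle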
*/ #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/port.h" namespace butil { class IOBuf; @@ -78,11 +78,11 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg, const framework::Scope* scope); void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg, - butil::IOBufBytesIterator& iobuf, + butil::IOBufBytesIterator& iobuf, // NOLINT const platform::DeviceContext& ctx); void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg, - butil::IOBufBytesIterator& iobuf, + butil::IOBufBytesIterator& iobuf, // NOLINT const platform::DeviceContext& ctx); std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port); diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index 3b00f1d6ccc..3408ef5f91a 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -40,9 +40,9 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/backends/dynload/port.h" namespace paddle { namespace distributed { @@ -202,7 +202,7 @@ class ValueBlock { // value = _alloc.acquire(value_length_); table[id] = value; } else { - value = (VALUE *)(void *)(res->second); + value = (VALUE *)(void *)(res->second); // NOLINT } return value; } @@ -282,8 +282,8 @@ class ValueBlock { value->unseen_days_++; if (value->unseen_days_ >= threshold) { butil::return_object(iter->second); - //_alloc.release(iter->second); - //_alloc.release(value); + // _alloc.release(iter->second); + // _alloc.release(value); iter = table.erase(iter); } else { ++iter; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index d8b14fc0d4c..7aadc856129 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -38,8 +38,8 @@ limitations under the License. */ #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/reader/blocking_queue.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/timer.h" +#include "paddle/pten/backends/dynload/port.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc index 004dc71d07b..f01894f2cf4 100644 --- a/paddle/fluid/framework/io/shell.cc +++ b/paddle/fluid/framework/io/shell.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "paddle/fluid/framework/io/shell.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/io/shell.h b/paddle/fluid/framework/io/shell.h index 6fd00a516de..e92560980f5 100644 --- a/paddle/fluid/framework/io/shell.h +++ b/paddle/fluid/framework/io/shell.h @@ -34,8 +34,8 @@ #include #include -#include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/backends/dynload/port.h" #if defined(__arm__) || defined(__aarch64__) || defined(__ARM_NEON) || \ defined(__ARM_NEON__) diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 91d618970e3..6c97c7fefb1 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -34,7 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/trainer_desc.pb.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/reader/blocking_queue.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/port.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 135ef6a9706..1ef633d0f12 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/port.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 61c5d8d0e4f..92989eed7c0 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -28,7 +28,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/port.h" #ifdef _WIN32 #include diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index 57052243d2f..e418d412b55 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -20,7 +20,7 @@ #include #include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/port.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index c6d25137594..6c0707e3475 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -31,8 +31,8 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/backends/dynload/port.h" extern std::string paddle::framework::DataTypeToString( const framework::proto::VarType::Type type); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index cf8a32ba94a..ed0c8e51ac9 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/errors.h" -#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/pten/backends/dynload/port.h" DECLARE_bool(use_mkldnn); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index c5ca1fd0e8c..e5ba46f3128 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/port.h" DECLARE_bool(use_mkldnn); diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index 6e6c826a228..0aa39c9af5c 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -27,7 +27,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/port.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/device/npu/dynload/hccl.h b/paddle/fluid/platform/device/npu/dynload/hccl.h index a56180ce2d4..2c251ceb549 100644 --- a/paddle/fluid/platform/device/npu/dynload/hccl.h +++ b/paddle/fluid/platform/device/npu/dynload/hccl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/port.h" #define HCOM_GROUP_PREFIX "HCOM_GROUP_" diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 7695f8b5881..49391a65b18 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) +cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce pten_dynamic_loader) list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) @@ -34,24 +34,24 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) if(WITH_ROCM) - hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) + hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader pten_dynload_cuda) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc) elseif (WITH_ASCEND_CL) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl pten_dynload_warpctc) else() - nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) + nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader pten_dynload_cuda) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc) endif() if (WITH_MKLML) - cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) + cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml pten_dynload_mklml) endif() -cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader) +cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader pten_dynload_lapack) add_dependencies(dynload_lapack extern_lapack) # TODO(TJ): add iomp, mkldnn? if (MKL_FOUND AND WITH_ONEMKL) message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") - cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader) + cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader pten_dynload_mklrt) target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc index 41648c32fe6..aee0f2c9e39 100644 --- a/paddle/fluid/platform/dynload/cublas.cc +++ b/paddle/fluid/platform/dynload/cublas.cc @@ -17,8 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { namespace dynload { -std::once_flag cublas_dso_flag; -void *cublas_dso_handle = nullptr; #define DEFINE_WRAP(__name) DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 17ae4d5bf03..eb2c019af9b 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -20,16 +20,12 @@ limitations under the License. 
*/ #include // NOLINT #include -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/cublas.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag cublas_dso_flag; -extern void *cublas_dso_handle; - /** * The following macro definition can generate structs * (for each function) to dynamic load cublas routine @@ -37,19 +33,8 @@ extern void *cublas_dso_handle; * * note: default dynamic linked libs */ -#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using cublas_func = \ - decltype(::__name(std::declval()...)) (*)(Args...); \ - std::call_once(cublas_dso_flag, []() { \ - cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(cublas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ @@ -99,7 +84,7 @@ extern void *cublas_dso_handle; __macro(cublasSgetrsBatched); \ __macro(cublasDgetrsBatched); -CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +CUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // APIs available after CUDA 8.0 #if CUDA_VERSION >= 8000 @@ -111,7 +96,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) __macro(cublasZgemmStridedBatched); \ __macro(cublasHgemmStridedBatched); -CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +CUBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif // APIs available after CUDA 9.0 @@ -120,7 +105,7 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) __macro(cublasSetMathMode); \ __macro(cublasGetMathMode); -CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +CUBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif // APIs available after CUDA 9.1 @@ -129,10 +114,10 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) __macro(cublasGemmBatchedEx); \ __macro(cublasGemmStridedBatchedEx); -CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +CUBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif -#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cublasLt.cc b/paddle/fluid/platform/dynload/cublasLt.cc index 78f952985c8..891df24034b 100644 --- a/paddle/fluid/platform/dynload/cublasLt.cc +++ b/paddle/fluid/platform/dynload/cublasLt.cc @@ -17,8 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { namespace dynload { -std::once_flag cublasLt_dso_flag; -void *cublasLt_dso_handle = nullptr; #define DEFINE_WRAP(__name) DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/cublasLt.h b/paddle/fluid/platform/dynload/cublasLt.h index f4e04c94e04..aa605be1d69 100644 --- a/paddle/fluid/platform/dynload/cublasLt.h +++ b/paddle/fluid/platform/dynload/cublasLt.h @@ -19,16 +19,12 @@ limitations under the License. 
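For context on what is being deleted here: the struct that DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP used to generate (and that now lives behind pten::dynload) is the classic lazy-binding wrapper: dlopen the library once under std::call_once, resolve each symbol with dlsym on first use, cache the pointer in a function-local static, and forward the arguments. A self-contained sketch of that pattern with a hypothetical libexample.so and example_add symbol; the real macro uses a variadic template so one definition covers every cuBLAS signature:

#include <dlfcn.h>
#include <mutex>
#include <stdexcept>

namespace demo {
std::once_flag dso_flag;
void* dso_handle = nullptr;

// Functor that lazily resolves "example_add" from libexample.so (hypothetical).
struct DynLoad_example_add {
  int operator()(int a, int b) {
    std::call_once(dso_flag, [] {
      dso_handle = dlopen("libexample.so", RTLD_LAZY | RTLD_LOCAL);
    });
    if (dso_handle == nullptr) {
      throw std::runtime_error("libexample.so could not be loaded");
    }
    using func_t = int (*)(int, int);
    // Resolved once per wrapped function; later calls reuse the cached pointer.
    static void* p_example_add = dlsym(dso_handle, "example_add");
    if (p_example_add == nullptr) {
      throw std::runtime_error("example_add not found in libexample.so");
    }
    return reinterpret_cast<func_t>(p_example_add)(a, b);
  }
};
}  // namespace demo

Keeping the symbol pointer in a function-local static means the dlsym cost is paid once per wrapped function rather than once per call.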
*/ #include // NOLINT #include -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/cublasLt.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag cublasLt_dso_flag; -extern void *cublasLt_dso_handle; - /** * The following macro definition can generate structs * (for each function) to dynamic load cublasLt routine @@ -36,20 +32,8 @@ extern void *cublasLt_dso_handle; * * note: default dynamic linked libs */ -#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using cublasLt_func = \ - decltype(::__name(std::declval()...)) (*)(Args...); \ - std::call_once(cublasLt_dso_flag, []() { \ - cublasLt_dso_handle = \ - paddle::platform::dynload::GetCublasLtDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name // APIs available after CUDA 10.1 @@ -69,10 +53,10 @@ extern void *cublasLt_dso_handle; __macro(cublasLtMatrixTransformDescDestroy); \ __macro(cublasLtMatrixTransformDescSetAttribute); -CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) +CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) // #endif -#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cuda_driver.cc b/paddle/fluid/platform/dynload/cuda_driver.cc index 6110e6b6ba9..f4c814979e5 100644 --- a/paddle/fluid/platform/dynload/cuda_driver.cc +++ b/paddle/fluid/platform/dynload/cuda_driver.cc @@ -13,14 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/cuda_driver.h" +#include "paddle/pten/backends/dynload/cuda_driver.h" namespace paddle { namespace platform { namespace dynload { -std::once_flag cuda_dso_flag; -void* cuda_dso_handle = nullptr; - #define DEFINE_WRAP(__name) DynLoad__##__name __name #if CUDA_VERSION >= 10020 @@ -28,10 +26,7 @@ CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP); #endif CUDA_ROUTINE_EACH(DEFINE_WRAP); -bool HasCUDADriver() { - std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); }); - return cuda_dso_handle != nullptr; -} +bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); } } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h index b5212c64cd1..4d48e9f778e 100644 --- a/paddle/fluid/platform/dynload/cuda_driver.h +++ b/paddle/fluid/platform/dynload/cuda_driver.h @@ -17,30 +17,17 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/cuda_driver.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag cuda_dso_flag; -extern void* cuda_dso_handle; extern bool HasCUDADriver(); -#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using cuda_func = decltype(&::__name); \ - std::call_once(cuda_dso_flag, []() { \ - cuda_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \ - }); \ - static void* p_##__name = dlsym(cuda_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name +#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name /** * include all needed cuda driver functions @@ -72,12 +59,12 @@ extern bool HasCUDADriver(); __macro(cuMemRelease); \ __macro(cuMemAddressFree) -CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); +CUDA_ROUTINE_EACH_VVM(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP); #endif -CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); +CUDA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP); -#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index 366762401c7..1d89f526dc1 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -13,13 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/cudnn.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/backends/dynload/cudnn.h" namespace paddle { namespace platform { namespace dynload { -std::once_flag cudnn_dso_flag; -void* cudnn_dso_handle = nullptr; #define DEFINE_WRAP(__name) DynLoad__##__name __name @@ -45,19 +43,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); #endif -bool HasCUDNN() { - std::call_once(cudnn_dso_flag, - []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); - return cudnn_dso_handle != nullptr; -} - -void EnforceCUDNNLoaded(const char* fn_name) { - PADDLE_ENFORCE_NOT_NULL( - cudnn_dso_handle, - platform::errors::PreconditionNotMet( - "Cannot load cudnn shared library. Cannot invoke method %s.", - fn_name)); -} +bool HasCUDNN() { return pten::dynload::HasCUDNN(); } } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 3420c38fe96..a46c7303cfc 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -18,32 +18,17 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/cudnn.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag cudnn_dso_flag; -extern void* cudnn_dso_handle; extern bool HasCUDNN(); -extern void EnforceCUDNNLoaded(const char* fn_name); -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using cudnn_func = decltype(&::__name); \ - std::call_once(cudnn_dso_flag, []() { \ - cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ - }); \ - EnforceCUDNNLoaded(#__name); \ - static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name +#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name /** * include all needed cudnn functions in HPPL @@ -127,7 +112,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnGetActivationDescriptor); \ __macro(cudnnDestroyActivationDescriptor); \ __macro(cudnnSetRNNDescriptor_v6); -CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +CUDNN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \ @@ -135,7 +120,8 @@ CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(cudnnGetConvolutionForwardAlgorithm); \ __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ __macro(cudnnSetRNNDescriptor); -CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8( + PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif #if CUDNN_VERSION >= 7001 @@ -153,7 +139,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ __macro(cudnnGetConvolutionForwardAlgorithm_v7); \ __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount); -CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +CUDNN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif #if CUDNN_VERSION >= 7201 @@ -166,7 +152,7 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(cudnnRNNBackwardDataEx); \ __macro(cudnnRNNBackwardWeightsEx); \ __macro(cudnnRNNForwardInferenceEx); -CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif #if CUDNN_VERSION >= 7401 @@ -176,7 +162,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ __macro(cudnnBatchNormalizationBackwardEx); \ __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); -CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +CUDNN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif #if CUDNN_VERSION >= 8000 @@ -192,7 +178,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(cudnnSetFusedOpsConstParamPackAttribute); \ __macro(cudnnSetFusedOpsVariantParamPackAttribute); \ __macro(cudnnMakeFusedOpsPlan); -CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +CUDNN_DNN_ROUTINE_EACH_R8(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif } // namespace dynload diff --git a/paddle/fluid/platform/dynload/cufft.cc b/paddle/fluid/platform/dynload/cufft.cc index a125fb72260..3f3534112e4 100644 --- a/paddle/fluid/platform/dynload/cufft.cc +++ b/paddle/fluid/platform/dynload/cufft.cc @@ -13,31 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
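The CUDNN_DNN_ROUTINE_EACH* lists above are X-macro lists: the header applies PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP to every name to produce one functor type and one extern global per API, and the matching .cc file applies DEFINE_WRAP to the same lists to define those globals. A reduced illustration of the mechanism with two invented entries (not cuDNN's real list):

// One entry per wrapped API; extending the list wraps another routine.
#define MY_ROUTINE_EACH(__macro) \
  __macro(my_open);              \
  __macro(my_close)

// Header side: declare a functor type and an extern instance per name.
#define DECLARE_WRAP(__name)                          \
  struct DynLoad__##__name {                          \
    int operator()() { return 0; /* placeholder */ }  \
  };                                                  \
  extern DynLoad__##__name __name
MY_ROUTINE_EACH(DECLARE_WRAP);

// Source side: define the globals by replaying the same list.
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MY_ROUTINE_EACH(DEFINE_WRAP);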
*/ #include "paddle/fluid/platform/dynload/cufft.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/backends/dynload/cufft.h" namespace paddle { namespace platform { namespace dynload { -std::once_flag cufft_dso_flag; -void* cufft_dso_handle = nullptr; #define DEFINE_WRAP(__name) DynLoad__##__name __name CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); -bool HasCUFFT() { - std::call_once(cufft_dso_flag, - []() { cufft_dso_handle = GetCUFFTDsoHandle(); }); - return cufft_dso_handle != nullptr; -} - -void EnforceCUFFTLoaded(const char* fn_name) { - PADDLE_ENFORCE_NOT_NULL( - cufft_dso_handle, - platform::errors::PreconditionNotMet( - "Cannot load cufft shared library. Cannot invoke method %s.", - fn_name)); -} +bool HasCUFFT() { return pten::dynload::HasCUFFT(); } } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/cufft.h b/paddle/fluid/platform/dynload/cufft.h index ef924d7b5ee..4d95edeefc0 100644 --- a/paddle/fluid/platform/dynload/cufft.h +++ b/paddle/fluid/platform/dynload/cufft.h @@ -19,32 +19,17 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/cufft.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag cufft_dso_flag; -extern void* cufft_dso_handle; extern bool HasCUFFT(); -extern void EnforceCUFFTLoaded(const char* fn_name); -#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using cufft_func = decltype(&::__name); \ - std::call_once(cufft_dso_flag, []() { \ - cufft_dso_handle = paddle::platform::dynload::GetCUFFTDsoHandle(); \ - }); \ - EnforceCUFFTLoaded(#__name); \ - static void* p_##__name = dlsym(cufft_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name +#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name /** * include all needed cufft functions in HPPL @@ -104,7 +89,7 @@ extern void EnforceCUFFTLoaded(const char* fn_name); __macro(cufftXtExecDescriptor); \ __macro(cufftXtSetWorkAreaPolicy); -CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP) +CUFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP) } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/cupti.cc b/paddle/fluid/platform/dynload/cupti.cc index d8381580c90..5e2c8630617 100644 --- a/paddle/fluid/platform/dynload/cupti.cc +++ b/paddle/fluid/platform/dynload/cupti.cc @@ -20,9 +20,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag cupti_dso_flag; -void *cupti_dso_handle = nullptr; - #define DEFINE_WRAP(__name) DynLoad__##__name __name CUPTI_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index 49bfdce4d38..c6d844cee9d 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -19,16 +19,12 @@ limitations under the License. 
*/ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/cupti.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag cupti_dso_flag; -extern void *cupti_dso_handle; - /** * The following macro definition can generate structs * (for each function) to dynamic load cupti routine @@ -36,18 +32,8 @@ extern void *cupti_dso_handle; * * note: default dynamic linked libs */ -#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline CUptiResult CUPTIAPI operator()(Args... args) { \ - using cuptiFunc = decltype(&::__name); \ - std::call_once(cupti_dso_flag, []() { \ - cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(cupti_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define CUPTI_ROUTINE_EACH(__macro) \ diff --git a/paddle/fluid/platform/dynload/curand.cc b/paddle/fluid/platform/dynload/curand.cc index ce83ebc84fe..9a6686515ea 100644 --- a/paddle/fluid/platform/dynload/curand.cc +++ b/paddle/fluid/platform/dynload/curand.cc @@ -18,9 +18,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag curand_dso_flag; -void *curand_dso_handle; - #define DEFINE_WRAP(__name) DynLoad__##__name __name CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 7a160664bc2..89b08bf7097 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -16,27 +16,14 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/curand.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag curand_dso_flag; -extern void *curand_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - curandStatus_t operator()(Args... 
args) { \ - using curandFunc = decltype(&::__name); \ - std::call_once(curand_dso_flag, []() { \ - curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(curand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define CURAND_RAND_ROUTINE_EACH(__macro) \ @@ -48,7 +35,7 @@ extern void *curand_dso_handle; __macro(curandGenerateNormal); \ __macro(curandDestroyGenerator); -CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); +CURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/cusolver.cc b/paddle/fluid/platform/dynload/cusolver.cc index d4163e9a743..bf8394f3f02 100644 --- a/paddle/fluid/platform/dynload/cusolver.cc +++ b/paddle/fluid/platform/dynload/cusolver.cc @@ -18,9 +18,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag cusolver_dso_flag; -void *cusolver_dso_handle; - #define DEFINE_WRAP(__name) DynLoad__##__name __name CUSOLVER_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h index 63661a93cfd..e4b29c63773 100644 --- a/paddle/fluid/platform/dynload/cusolver.h +++ b/paddle/fluid/platform/dynload/cusolver.h @@ -17,28 +17,14 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/cusolver.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag cusolver_dso_flag; -extern void *cusolver_dso_handle; -#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cusolverStatus_t operator()(Args... 
args) { \ - using cusolverFunc = decltype(&::__name); \ - std::call_once(cusolver_dso_flag, []() { \ - cusolver_dso_handle = \ - paddle::platform::dynload::GetCusolverDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define CUSOLVER_ROUTINE_EACH(__macro) \ @@ -62,7 +48,7 @@ extern void *cusolver_dso_handle; __macro(cusolverDnCheevd); \ __macro(cusolverDnZheevd); -CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); +CUSOLVER_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); #if CUDA_VERSION >= 9020 #define CUSOLVER_ROUTINE_EACH_R1(__macro) \ @@ -105,7 +91,7 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); __macro(cusolverDnCungqr); \ __macro(cusolverDnZungqr); -CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) +CUSOLVER_ROUTINE_EACH_R1(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif #if CUDA_VERSION >= 9020 @@ -117,10 +103,10 @@ CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) __macro(cusolverDnDsyevj); \ __macro(cusolverDnDestroySyevjInfo); -CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) +CUSOLVER_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif -#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cusparse.cc b/paddle/fluid/platform/dynload/cusparse.cc index be67f121d68..ea7c502e3e6 100644 --- a/paddle/fluid/platform/dynload/cusparse.cc +++ b/paddle/fluid/platform/dynload/cusparse.cc @@ -18,9 +18,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag cusparse_dso_flag; -void *cusparse_dso_handle; - #define DEFINE_WRAP(__name) DynLoad__##__name __name #ifdef CUSPARSE_ROUTINE_EACH diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index fc842a3377b..596d2b51aec 100644 --- a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -17,28 +17,14 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/cusparse.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag cusparse_dso_flag; -extern void *cusparse_dso_handle; -#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cusparseStatus_t operator()(Args... 
args) { \ - using cusparseFunc = decltype(&::__name); \ - std::call_once(cusparse_dso_flag, []() { \ - cusparse_dso_handle = \ - paddle::platform::dynload::GetCusparseDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #if defined(PADDLE_WITH_CUDA) @@ -54,7 +40,7 @@ extern void *cusparse_dso_handle; __macro(cusparseSetMatType); \ __macro(cusparseSetMatIndexBase); -CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP); +CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP); // APIs available after CUDA 11.2 #if CUDA_VERSION >= 11020 @@ -74,7 +60,7 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP); __macro(cusparseSparseToDense_bufferSize); \ __macro(cusparseSparseToDense); -CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) +CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) // APIs available after CUDA 11.3 #if CUDA_VERSION >= 11030 @@ -83,13 +69,13 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) __macro(cusparseSDDMM_preprocess); \ __macro(cusparseSDDMM); -CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) +CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) #endif #endif #endif #endif -#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 905f1aea887..caefb5a4e22 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -15,556 +15,61 @@ limitations under the License. */ #include #include - #include "gflags/gflags.h" -#include "glog/logging.h" -#include "paddle/fluid/platform/dynload/cupti_lib_path.h" -#include "paddle/fluid/platform/enforce.h" - -#if defined(_WIN32) -#include -#endif - -DEFINE_string(cudnn_dir, "", - "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib. If empty [default], dlopen " - "will search cudnn from LD_LIBRARY_PATH"); - -DEFINE_string( - cuda_dir, "", - "Specify path for loading cuda library, such as libcublas, libcublasLt " - "libcurand, libcusolver. For instance, /usr/local/cuda/lib64. " - "If default, dlopen will search cuda from LD_LIBRARY_PATH"); - -DEFINE_string(nccl_dir, "", - "Specify path for loading nccl library, such as libnccl.so. " - "For instance, /usr/local/cuda/lib64. If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); - -DEFINE_string(hccl_dir, "", - "Specify path for loading hccl library, such as libhccl.so. " - "For instance, " - "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. If " - "default, " - "dlopen will search hccl from LD_LIBRARY_PATH"); - -DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); - -DEFINE_string( - tensorrt_dir, "", - "Specify path for loading tensorrt library, such as libnvinfer.so."); - -DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); - -DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); - -DEFINE_string(mkl_dir, "", - "Specify path for loading libmkl_rt.so. " - "For insrance, /opt/intel/oneapi/mkl/latest/lib/intel64/." 
- "If default, " - "dlopen will search mkl from LD_LIBRARY_PATH"); - -DEFINE_string(op_dir, "", "Specify path for loading user-defined op library."); - -#ifdef PADDLE_WITH_HIP - -DEFINE_string(miopen_dir, "", - "Specify path for loading libMIOpen.so. For instance, " - "/opt/rocm/miopen/lib. If empty [default], dlopen " - "will search miopen from LD_LIBRARY_PATH"); - -DEFINE_string(rocm_dir, "", - "Specify path for loading rocm library, such as librocblas, " - "libmiopen, libhipsparse. For instance, /opt/rocm/lib. " - "If default, dlopen will search rocm from LD_LIBRARY_PATH"); - -DEFINE_string(rccl_dir, "", - "Specify path for loading rccl library, such as librccl.so. " - "For instance, /opt/rocm/rccl/lib. If default, " - "dlopen will search rccl from LD_LIBRARY_PATH"); -#endif +#include "paddle/pten/backends/dynload/dynamic_loader.h" namespace paddle { namespace platform { namespace dynload { -struct PathNode { - PathNode() {} - std::string path = ""; -}; - -static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; - -// NOTE: In order to adapt to the default installation path of cuda -#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin"; -#else -static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64"; -#endif - -static PathNode s_py_site_pkg_path; - -#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll"; -static constexpr char* win_cublas_lib = - "cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cublas64_" CUDA_VERSION_MAJOR ".dll"; -#if CUDA_VERSION >= 11000 -static constexpr char* win_curand_lib = - "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll"; -static constexpr char* win_nvjpeg_lib = - "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll"; -static constexpr char* win_cusolver_lib = - "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll"; -static constexpr char* win_cusparse_lib = - "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll"; -static constexpr char* win_cufft_lib = - "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_10.dll"; -#else -static constexpr char* win_curand_lib = - "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;curand64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_nvjpeg_lib = - "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_cusolver_lib = - "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_cusparse_lib = - "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_cufft_lib = - "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll"; -#endif // CUDA_VERSION -#endif - -static inline std::string join(const std::string& part1, - const std::string& part2) { - // directory separator - const char sep = '/'; - if (!part2.empty() && part2.front() == sep) { - return part2; - } - std::string ret; - ret.reserve(part1.size() + part2.size() + 1); - ret = part1; - if (!ret.empty() && ret.back() != sep) { - ret += sep; - } - ret += part2; - return ret; -} - 
-static inline std::vector split( - const std::string& str, const std::string separator = " ") { - std::vector str_list; - std::string::size_type firstPos; - firstPos = str.find_first_not_of(separator, 0); - std::string::size_type lastPos; - lastPos = str.find_first_of(separator, firstPos); - while (std::string::npos != firstPos && std::string::npos != lastPos) { - str_list.push_back(str.substr(firstPos, lastPos - firstPos)); - firstPos = str.find_first_not_of(separator, lastPos); - lastPos = str.find_first_of(separator, firstPos); - } - if (std::string::npos == lastPos) { - str_list.push_back(str.substr(firstPos, lastPos - firstPos)); - } - return str_list; -} - void SetPaddleLibPath(const std::string& py_site_pkg_path) { - s_py_site_pkg_path.path = py_site_pkg_path; - VLOG(3) << "Set paddle lib path : " << py_site_pkg_path; -} - -static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path, - const std::string& dso_name, - int dynload_flags) { - void* dso_handle = nullptr; - if (!spec_path.empty()) { - // search xxx.so from custom path - VLOG(3) << "Try to find library: " << dso_name - << " from specific path: " << spec_path; - std::string dso_path = join(spec_path, dso_name); - dso_handle = dlopen(dso_path.c_str(), dynload_flags); - } - return dso_handle; -} - -static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, - int dynload_flags) { - // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH - // and /usr/local/lib path - void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); - VLOG(3) << "Try to find library: " << dso_path - << " from default system path."; - -// TODO(chenweihang): This path is used to search which libs? -// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to -// bring System Integrity Projection (SIP), if dso_handle -// is null, search from default package path in Mac OS. -#if defined(__APPLE__) || defined(__OSX__) - if (nullptr == dso_handle) { - dso_handle = - dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags); - } -#endif - - return dso_handle; + pten::dynload::SetPaddleLibPath(py_site_pkg_path); } -/* - * We define three priorities for dynamic library search: - * - * First: Search for the path specified by the user - * Second: Search the system default path - * Third: Search for a special path corresponding to - * a specific library to adapt to changes and easy to expand. - */ - -static inline void* GetDsoHandleFromSearchPath( - const std::string& config_path, const std::string& dso_name, - bool throw_on_error = true, - const std::vector& extra_paths = std::vector(), - const std::string& warning_msg = std::string()) { -#if !defined(_WIN32) - int dynload_flags = RTLD_LAZY | RTLD_LOCAL; -#else - int dynload_flags = 0; -#endif // !_WIN32 - std::vector dso_names = split(dso_name, ";"); - void* dso_handle = nullptr; - for (auto dso : dso_names) { - // 1. search in user config path by FLAGS - dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags); - // 2. search in system default path - if (nullptr == dso_handle) { - dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags); - } - // 3. search in extra paths - if (nullptr == dso_handle) { - for (auto path : extra_paths) { - VLOG(3) << "extra_paths: " << path; - dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags); - } - } - if (nullptr != dso_handle) break; - } +void* GetCublasDsoHandle() { return pten::dynload::GetCublasDsoHandle(); } - // 4. 
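As the removed comment above spells out, GetDsoHandleFromSearchPath (now provided by pten::dynload) probes locations in a fixed priority order: the directory named by the corresponding FLAGS_* option, then the system default locations such as LD_LIBRARY_PATH, then any extra fallback paths like the bundled CUDA directory, stopping at the first dlopen that succeeds. A condensed sketch of that order; the Windows branch, the multi-name split on ';', and the error report are omitted:

#include <dlfcn.h>
#include <string>
#include <vector>

// Simplified: user-configured dir first, system default second, fallbacks last.
void* FindDso(const std::string& user_dir, const std::string& name,
              const std::vector<std::string>& extra_dirs) {
  const int flags = RTLD_LAZY | RTLD_LOCAL;
  if (!user_dir.empty()) {
    if (void* h = dlopen((user_dir + "/" + name).c_str(), flags)) return h;
  }
  // Bare name: searched via LD_LIBRARY_PATH and the loader cache.
  if (void* h = dlopen(name.c_str(), flags)) return h;
  for (const auto& dir : extra_dirs) {
    if (void* h = dlopen((dir + "/" + name).c_str(), flags)) return h;
  }
  return nullptr;  // caller decides whether to warn or throw
}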
[If Failed for All dso_names] logging warning if exists - if (nullptr == dso_handle && !warning_msg.empty()) { - LOG(WARNING) << warning_msg; - } +void* GetCublasLtDsoHandle() { return pten::dynload::GetCublasLtDsoHandle(); } - // 5. [If Failed for All dso_names] logging or throw error info - if (nullptr == dso_handle) { - auto error_msg = - "The third-party dynamic library (%s) that Paddle depends on is not " - "configured correctly. (error code is %s)\n" - " Suggestions:\n" - " 1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) " - "is installed correctly and its version is matched with paddlepaddle " - "you installed.\n" - " 2. Configure third-party dynamic library environment variables as " - "follows:\n" - " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" - " - Windows: set PATH by `set PATH=XXX;%PATH%`\n" - " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` " - "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is " - "impossible unless System Integrity Protection (SIP) is disabled.]"; -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - if (throw_on_error) { - // NOTE: Special error report case, no need to change its format - PADDLE_THROW( - platform::errors::PreconditionNotMet(error_msg, dso_name, errorno)); - } else { - LOG(WARNING) << string::Sprintf(error_msg, dso_name, errorno); - } - } +void* GetCUDNNDsoHandle() { return pten::dynload::GetCUDNNDsoHandle(); } - return dso_handle; -} +void* GetCUPTIDsoHandle() { return pten::dynload::GetCUPTIDsoHandle(); } -void* GetCublasDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib, true, - {cuda_lib_path}); -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); -#endif -} - -void* GetCublasLtDsoHandle() { -// APIs available after CUDA 10.1 -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100 - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so"); -#else - std::string warning_msg( - "Your CUDA_VERSION less 10.1, not support CublasLt. " - "If you want to use CublasLt, please upgrade CUDA and rebuild " - "PaddlePaddle."); - return nullptr; -#endif -} - -void* GetCUDNNDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - std::string mac_warn_meg( - "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " - "For instance, sudo tar -xzf " - "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " - "chmod a+r /usr/local/cuda/include/cudnn.h " - "/usr/local/cuda/lib/libcudnn*"); - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false, - {}, mac_warn_meg); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - std::string win_warn_meg( - "Note: [Recommend] copy cudnn into CUDA installation directory. 
\n " - "For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from " - "NVIDIA's official website, \n" - "then, unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing " - "Toolkit\\CUDA\\v10.0\n" - "You should do this according to your CUDA installation directory and " - "CUDNN version."); - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib, true, - {cuda_lib_path}, win_warn_meg); -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); -#else - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false, - {cuda_lib_path}); -#endif -} - -void* GetCUPTIDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.dylib", false, - {cupti_lib_path}); -#else - return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.so", false, - {cupti_lib_path}); -#endif -} - -void* GetCurandDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib, true, - {cuda_lib_path}); -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); -#endif -} +void* GetCurandDsoHandle() { return pten::dynload::GetCurandDsoHandle(); } #ifdef PADDLE_WITH_HIP -void* GetROCFFTDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib"); -#else - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so"); -#endif -} +void* GetROCFFTDsoHandle() { return pten::dynload::GetROCFFTDsoHandle(); } #endif -void* GetNvjpegDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_nvjpeg_lib, true, - {cuda_lib_path}); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so"); -#endif -} +void* GetNvjpegDsoHandle() { return pten::dynload::GetNvjpegDsoHandle(); } -void* GetCusolverDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cusolver_lib, true, - {cuda_lib_path}); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so"); -#endif -} +void* GetCusolverDsoHandle() { return pten::dynload::GetCusolverDsoHandle(); } -void* GetCusparseDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cusparse_lib, true, - {cuda_lib_path}); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so"); -#endif -} +void* GetCusparseDsoHandle() { return pten::dynload::GetCusparseDsoHandle(); } -void* GetNVRTCDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", 
false); -#endif -} +void* GetNVRTCDsoHandle() { return pten::dynload::GetNVRTCDsoHandle(); } -void* GetCUDADsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); -#elif defined(_WIN32) - char system32_dir[MAX_PATH]; - GetSystemDirectory(system32_dir, MAX_PATH); - return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false); -#endif -} +void* GetCUDADsoHandle() { return pten::dynload::GetCUDADsoHandle(); } -void* GetWarpCTCDsoHandle() { - std::string warpctc_dir = ""; - if (!s_py_site_pkg_path.path.empty()) { - warpctc_dir = s_py_site_pkg_path.path; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll"); -#else - return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so"); -#endif -} +void* GetWarpCTCDsoHandle() { return pten::dynload::GetWarpCTCDsoHandle(); } -void* GetNCCLDsoHandle() { -#ifdef PADDLE_WITH_HIP - std::string warning_msg( - "You may need to install 'rccl' from ROCM official website: " - "https://rocmdocs.amd.com/en/latest/Installation_Guide/" - "Installation-Guide.html before install PaddlePaddle."); -#else - std::string warning_msg( - "You may need to install 'nccl2' from NVIDIA official website: " - "https://developer.nvidia.com/nccl/nccl-download" - "before install PaddlePaddle."); -#endif +void* GetNCCLDsoHandle() { return pten::dynload::GetNCCLDsoHandle(); } +void* GetHCCLDsoHandle() { return pten::dynload::GetHCCLDsoHandle(); } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {}, - warning_msg); -#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) - return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true, {}, - warning_msg); -#else - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {}, - warning_msg); -#endif -} -void* GetHCCLDsoHandle() { - std::string warning_msg( - "You may need to install 'hccl2' from Huawei official website: " - "before install PaddlePaddle."); -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {}, - warning_msg); -#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) - return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true); +void* GetTensorRtDsoHandle() { return pten::dynload::GetTensorRtDsoHandle(); } -#elif defined(PADDLE_WITH_ASCEND_CL) - return GetDsoHandleFromSearchPath(FLAGS_hccl_dir, "libhccl.so", true, {}, - warning_msg); -#else - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {}, - warning_msg); -#endif -} +void* GetMKLMLDsoHandle() { return pten::dynload::GetMKLMLDsoHandle(); } -void* GetTensorRtDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so"); -#endif -} - -void* GetMKLMLDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); -#elif defined(_WIN32) - return 
GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); -#endif -} - -void* GetLAPACKDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3"); -#endif -} +void* GetLAPACKDsoHandle() { return pten::dynload::GetLAPACKDsoHandle(); } void* GetOpDsoHandle(const std::string& dso_name) { - return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name); + return pten::dynload::GetOpDsoHandle(dso_name); } -void* GetNvtxDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Apple.")); -#elif defined(_WIN32) - PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Windows.")); -#elif !defined(PADDLE_WITH_CUDA) - PADDLE_THROW( - platform::errors::Unimplemented("Nvtx do not support without CUDA.")); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so"); -#endif -} +void* GetNvtxDsoHandle() { return pten::dynload::GetNvtxDsoHandle(); } -void* GetCUFFTDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cufft_lib, true, - {cuda_lib_path}); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); -#endif -} +void* GetCUFFTDsoHandle() { return pten::dynload::GetCUFFTDsoHandle(); } -void* GetMKLRTDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so"); -#endif -} +void* GetMKLRTDsoHandle() { return pten::dynload::GetMKLRTDsoHandle(); } } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/hipfft.cc b/paddle/fluid/platform/dynload/hipfft.cc index 767d2161be9..0da4758e6d5 100644 --- a/paddle/fluid/platform/dynload/hipfft.cc +++ b/paddle/fluid/platform/dynload/hipfft.cc @@ -18,9 +18,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag hipfft_dso_flag; -void *hipfft_dso_handle; - #define DEFINE_WRAP(__name) DynLoad__##__name __name HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/hipfft.h b/paddle/fluid/platform/dynload/hipfft.h index 50c25935e41..356b6c48a64 100644 --- a/paddle/fluid/platform/dynload/hipfft.h +++ b/paddle/fluid/platform/dynload/hipfft.h @@ -17,8 +17,7 @@ limitations under the License. */ #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/hipfft.h" namespace paddle { namespace platform { @@ -26,18 +25,8 @@ namespace dynload { extern std::once_flag hipfft_dso_flag; extern void *hipfft_dso_handle; -#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using hipfftFunc = decltype(&::__name); \ - std::call_once(hipfft_dso_flag, []() { \ - hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define HIPFFT_FFT_ROUTINE_EACH(__macro) \ @@ -70,53 +59,8 @@ extern void *hipfft_dso_handle; __macro(hipfftGetVersion); \ __macro(hipfftGetProperty); -HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP); +HIPFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP); -inline const char *hipfftGetErrorString(hipfftResult_t status) { - switch (status) { - case HIPFFT_SUCCESS: - return "'HIPFFT_SUCCESS'. The hipFFT operation was successful."; - case HIPFFT_INVALID_PLAN: - return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle."; - case HIPFFT_ALLOC_FAILED: - return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU " - "memory."; - case HIPFFT_INVALID_TYPE: - return "'HIPFFT_INVALID_TYPE'. No longer used."; - case HIPFFT_INVALID_VALUE: - return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or " - "parameter."; - case HIPFFT_INTERNAL_ERROR: - return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library " - "error."; - case HIPFFT_EXEC_FAILED: - return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU."; - case HIPFFT_SETUP_FAILED: - return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize."; - case HIPFFT_INVALID_SIZE: - return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size."; - case HIPFFT_UNALIGNED_DATA: - return "'HIPFFT_UNALIGNED_DATA'. No longer used."; - case HIPFFT_INCOMPLETE_PARAMETER_LIST: - return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call."; - case HIPFFT_INVALID_DEVICE: - return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different " - "GPU than plan creation."; - case HIPFFT_PARSE_ERROR: - return "'HIPFFT_PARSE_ERROR'. Internal plan database error."; - case HIPFFT_NO_WORKSPACE: - return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to " - "plan execution."; - case HIPFFT_NOT_IMPLEMENTED: - return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement " - "functionality for parameters given."; - case HIPFFT_NOT_SUPPORTED: - return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for " - "parameters given."; - default: - return "HIPFFT_STATUS_UNKNOWN_ERROR"; - } -} } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/hiprand.cc b/paddle/fluid/platform/dynload/hiprand.cc index 4fb26d0f9c8..4ad4eb8e41a 100644 --- a/paddle/fluid/platform/dynload/hiprand.cc +++ b/paddle/fluid/platform/dynload/hiprand.cc @@ -18,9 +18,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag hiprand_dso_flag; -void *hiprand_dso_handle; - #define DEFINE_WRAP(__name) DynLoad__##__name __name HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/hiprand.h b/paddle/fluid/platform/dynload/hiprand.h index 496e70bb26d..4d175b00c70 100644 --- a/paddle/fluid/platform/dynload/hiprand.h +++ b/paddle/fluid/platform/dynload/hiprand.h @@ -16,28 +16,15 @@ limitations under the License. 
*/ #include #include // NOLINT -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/hiprand.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag hiprand_dso_flag; -extern void *hiprand_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - hiprandStatus_t operator()(Args... args) { \ - using hiprandFunc = decltype(&::__name); \ - std::call_once(hiprand_dso_flag, []() { \ - hiprand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define HIPRAND_RAND_ROUTINE_EACH(__macro) \ @@ -49,7 +36,7 @@ extern void *hiprand_dso_handle; __macro(hiprandGenerateNormal); \ __macro(hiprandDestroyGenerator); -HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); +HIPRAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/hiprtc.cc b/paddle/fluid/platform/dynload/hiprtc.cc index 86a39d08eaa..ce4f915a9a4 100644 --- a/paddle/fluid/platform/dynload/hiprtc.cc +++ b/paddle/fluid/platform/dynload/hiprtc.cc @@ -13,23 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/hiprtc.h" +#include "paddle/pten/backends/dynload/hiprtc.h" namespace paddle { namespace platform { namespace dynload { -std::once_flag hiprtc_dso_flag; -void* hiprtc_dso_handle = nullptr; - #define DEFINE_WRAP(__name) DynLoad__##__name __name HIPRTC_ROUTINE_EACH(DEFINE_WRAP); -bool HasNVRTC() { - std::call_once(hiprtc_dso_flag, - []() { hiprtc_dso_handle = GetNVRTCDsoHandle(); }); - return hiprtc_dso_handle != nullptr; -} +bool HasNVRTC() { return pten::dynload::HasNVRTC(); } } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h index 4b376f1858f..f2bb56ace69 100644 --- a/paddle/fluid/platform/dynload/hiprtc.h +++ b/paddle/fluid/platform/dynload/hiprtc.h @@ -16,30 +16,17 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/hiprtc.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag hiprtc_dso_flag; -extern void* hiprtc_dso_handle; extern bool HasNVRTC(); -#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using hiprtc_func = decltype(&::__name); \ - std::call_once(hiprtc_dso_flag, []() { \ - hiprtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name +#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name /** * include all needed hiprtc functions @@ -55,9 +42,9 @@ extern bool HasNVRTC(); __macro(hiprtcGetProgramLog); \ __macro(hiprtcGetProgramLogSize) -HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP); +HIPRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP); -#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/lapack.cc b/paddle/fluid/platform/dynload/lapack.cc index eeebe240874..5a21bb4d041 100644 --- a/paddle/fluid/platform/dynload/lapack.cc +++ b/paddle/fluid/platform/dynload/lapack.cc @@ -13,15 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/lapack.h" -#include namespace paddle { namespace platform { namespace dynload { -std::once_flag lapack_dso_flag; -void* lapack_dso_handle = nullptr; - #define DEFINE_WRAP(__name) DynLoad__##__name __name LAPACK_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/lapack.h b/paddle/fluid/platform/dynload/lapack.h index ce24b98defb..4a55237e3ac 100644 --- a/paddle/fluid/platform/dynload/lapack.h +++ b/paddle/fluid/platform/dynload/lapack.h @@ -16,122 +16,20 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" - -// Note(zhouwei): because lapack doesn't provide appropriate header file. -// should expose API statement yourself. 
- -// getrf_(For example) -extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, - int *info); -extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, - int *info); - -// evd -extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex *a, - int *lda, double *w, std::complex *work, - int *lwork, double *rwork, int *lrwork, int *iwork, - int *liwork, int *info); -extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex *a, - int *lda, float *w, std::complex *work, - int *lwork, float *rwork, int *lrwork, int *iwork, - int *liwork, int *info); -extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda, - double *w, double *work, int *lwork, int *iwork, - int *liwork, int *info); -extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda, - float *w, float *work, int *lwork, int *iwork, - int *liwork, int *info); - -// geev -extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, - double *wr, double *wi, double *vl, int *ldvl, - double *vr, int *ldvr, double *work, int *lwork, - int *info); -extern "C" void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, - float *wr, float *wi, float *vl, int *ldvl, float *vr, - int *ldvr, float *work, int *lwork, int *info); -extern "C" void zgeev_(char *jobvl, char *jobvr, int *n, - std::complex *a, int *lda, - std::complex *w, std::complex *vl, - int *ldvl, std::complex *vr, int *ldvr, - std::complex *work, int *lwork, double *rwork, - int *info); -extern "C" void cgeev_(char *jobvl, char *jobvr, int *n, std::complex *a, - int *lda, std::complex *w, - std::complex *vl, int *ldvl, - std::complex *vr, int *ldvr, - std::complex *work, int *lwork, float *rwork, - int *info); - -// gels -extern "C" void dgels_(char *trans, int *m, int *n, int *nrhs, double *a, - int *lda, double *b, int *ldb, double *work, int *lwork, - int *info); -extern "C" void sgels_(char *trans, int *m, int *n, int *nrhs, float *a, - int *lda, float *b, int *ldb, float *work, int *lwork, - int *info); - -// gelsd -extern "C" void dgelsd_(int *m, int *n, int *nrhs, double *a, int *lda, - double *b, int *ldb, double *s, double *rcond, - int *rank, double *work, int *lwork, int *iwork, - int *info); -extern "C" void sgelsd_(int *m, int *n, int *nrhs, float *a, int *lda, float *b, - int *ldb, float *s, float *rcond, int *rank, - float *work, int *lwork, int *iwork, int *info); - -// gelsy -extern "C" void dgelsy_(int *m, int *n, int *nrhs, double *a, int *lda, - double *b, int *ldb, int *jpvt, double *rcond, - int *rank, double *work, int *lwork, int *info); -extern "C" void sgelsy_(int *m, int *n, int *nrhs, float *a, int *lda, float *b, - int *ldb, int *jpvt, float *rcond, int *rank, - float *work, int *lwork, int *info); - -// gelss -extern "C" void dgelss_(int *m, int *n, int *nrhs, double *a, int *lda, - double *b, int *ldb, double *s, double *rcond, - int *rank, double *work, int *lwork, int *info); -extern "C" void sgelss_(int *m, int *n, int *nrhs, float *a, int *lda, float *b, - int *ldb, float *s, float *rcond, int *rank, - float *work, int *lwork, int *info); - -extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, - int *lda, std::complex *b, int *ldb, int *info); -extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, - int *lda, std::complex *b, int *ldb, int *info); -extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, - double *b, int *ldb, int *info); -extern "C" void spotrs_(char *uplo, int *n, 
int *nrhs, float *a, int *lda, - float *b, int *ldb, int *info); +#include "paddle/pten/backends/dynload/lapack.h" +#include "paddle/pten/common/complex.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag lapack_dso_flag; -extern void *lapack_dso_handle; - /** * The following macro definition can generate structs * (for each function) to dynamic load lapack routine * via operator overloading. */ -#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using lapackFunc = decltype(&::__name); \ - std::call_once(lapack_dso_flag, []() { \ - lapack_dso_handle = paddle::platform::dynload::GetLAPACKDsoHandle(); \ - }); \ - static void *p_##_name = dlsym(lapack_dso_handle, #__name); \ - return reinterpret_cast(p_##_name)(args...); \ - } \ - }; \ +#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \ diff --git a/paddle/fluid/platform/dynload/miopen.cc b/paddle/fluid/platform/dynload/miopen.cc index 1b4bdd2939f..5390bdc6c56 100644 --- a/paddle/fluid/platform/dynload/miopen.cc +++ b/paddle/fluid/platform/dynload/miopen.cc @@ -13,13 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/miopen.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/backends/dynload/cudnn.h" namespace paddle { namespace platform { namespace dynload { -std::once_flag miopen_dso_flag; -void* miopen_dso_handle = nullptr; #define DEFINE_WRAP(__name) DynLoad__##__name __name @@ -50,19 +48,7 @@ MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); #endif -bool HasCUDNN() { - std::call_once(miopen_dso_flag, - []() { miopen_dso_handle = GetCUDNNDsoHandle(); }); - return miopen_dso_handle != nullptr; -} - -void EnforceCUDNNLoaded(const char* fn_name) { - PADDLE_ENFORCE_NOT_NULL( - miopen_dso_handle, - platform::errors::PreconditionNotMet( - "Cannot load miopen shared library. Cannot invoke method %s.", - fn_name)); -} +bool HasCUDNN() { return pten::dynload::HasCUDNN(); } } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index 34845f24ff5..6f553272710 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -18,66 +18,17 @@ limitations under the License. 
*/ #include #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" - -#define MIOPEN_VERSION \ - (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \ - MIOPEN_VERSION_PATCH) // NOLINT - -// MIOPEN only support NCHW, just for compatibility with CUDNN API -typedef enum { - MIOPEN_TENSOR_NCHW = 0, - MIOPEN_TENSOR_NHWC = 1, -} miopenTensorFormat_t; +#include "paddle/pten/backends/dynload/miopen.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag miopen_dso_flag; -extern void* miopen_dso_handle; extern bool HasCUDNN(); -inline const char* miopenGetErrorString(miopenStatus_t status) { - switch (status) { - case miopenStatusSuccess: - return "MIOPEN_STATUS_SUCCESS"; - case miopenStatusNotInitialized: - return "MIOPEN_STATUS_NOT_INITIALIZED"; - case miopenStatusInvalidValue: - return "MIOPEN_STATUS_INVALID_VALUE"; - case miopenStatusBadParm: - return "MIOPEN_STATUS_BAD_PARAM"; - case miopenStatusAllocFailed: - return "MIOPEN_STATUS_ALLOC_FAILED"; - case miopenStatusInternalError: - return "MIOPEN_STATUS_INTERNAL_ERROR"; - case miopenStatusNotImplemented: - return "MIOPEN_STATUS_NOT_IMPLEMENTED"; - case miopenStatusUnsupportedOp: - return "MIOPEN_STATUS_UNSUPPORTED_OP"; - case miopenStatusUnknownError: - default: - return "MIOPEN_STATUS_UNKNOWN_ERROR"; - } -} - -extern void EnforceCUDNNLoaded(const char* fn_name); -#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using miopen_func = decltype(&::__name); \ - std::call_once(miopen_dso_flag, []() { \ - miopen_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ - }); \ - EnforceCUDNNLoaded(#__name); \ - static void* p_##__name = dlsym(miopen_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name /** * include all needed miopen functions in HPPL @@ -145,23 +96,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(miopenRNNForwardInference); \ __macro(miopenGetTensorNumBytes); -MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) +MIOPEN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) #define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \ __macro(miopenConvolutionBackwardData); -MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs available after R3: #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize); -MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs available after R4: #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ __macro(miopenBatchNormalizationForwardTraining); \ __macro(miopenBatchNormalizationForwardInference); \ __macro(miopenBatchNormalizationBackward); -MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs in R5 #define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \ @@ -169,12 +120,12 @@ MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) __macro(miopenSetActivationDescriptor); \ 
__macro(miopenGetActivationDescriptor); \ __macro(miopenDestroyActivationDescriptor); -MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R5(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs in R6 #define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \ /*__macro(miopenSetRNNDescriptor_v6);*/ -MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R6(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) #define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \ __macro(miopenSetConvolutionGroupCount); \ @@ -184,7 +135,7 @@ MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) __macro(miopenSetCTCLossDescriptor); \ __macro(miopenGetCTCLossWorkspaceSize); \ __macro(miopenCTCLoss); -MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ /*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ @@ -192,7 +143,7 @@ __macro(cudnnBatchNormalizationForwardTrainingEx); \ __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ __macro(cudnnBatchNormalizationBackwardEx); \ __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/ -MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/mklml.cc b/paddle/fluid/platform/dynload/mklml.cc index 020c02d9baa..ff475b2312c 100644 --- a/paddle/fluid/platform/dynload/mklml.cc +++ b/paddle/fluid/platform/dynload/mklml.cc @@ -18,9 +18,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag mklml_dso_flag; -void* mklml_dso_handle = nullptr; - #define DEFINE_WRAP(__name) DynLoad__##__name __name MKLML_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 335b919f41c..bd7d40eca3f 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -17,36 +17,23 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/mklml.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag mklml_dso_flag; -extern void *mklml_dso_handle; - /** * The following macro definition can generate structs * (for each function) to dynamic load mklml routine * via operator overloading. */ -#define DYNAMIC_LOAD_MKLML_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using mklmlFunc = decltype(&::__name); \ - std::call_once(mklml_dso_flag, []() { \ - mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \ - }); \ - static void *p_##_name = dlsym(mklml_dso_handle, #__name); \ - return reinterpret_cast(p_##_name)(args...); \ - } \ - }; \ +#define DYNAMIC_LOAD_MKLML_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name -#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name) +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) \ + DYNAMIC_LOAD_MKLML_WRAP(__name) #define MKLML_ROUTINE_EACH(__macro) \ __macro(cblas_sgemm); \ @@ -111,7 +98,7 @@ extern void *mklml_dso_handle; __macro(MKL_Set_Num_Threads); \ __macro(MKL_Get_Max_Threads); -MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); +MKLML_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP); #if !defined(_WIN32) DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm); diff --git a/paddle/fluid/platform/dynload/mklrt.h b/paddle/fluid/platform/dynload/mklrt.h index 423cd4d0a25..c01d52b9780 100644 --- a/paddle/fluid/platform/dynload/mklrt.h +++ b/paddle/fluid/platform/dynload/mklrt.h @@ -18,7 +18,7 @@ limitations under the License. */ #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/port.h" namespace paddle { namespace platform { @@ -32,18 +32,8 @@ extern void* mklrt_dso_handle; * (for each function) to dynamic load mkldfti routine * via operator overloading. */ -#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using mklrtFunc = decltype(&::__name); \ - std::call_once(mklrt_dso_flag, []() { \ - mklrt_dso_handle = paddle::platform::dynload::GetMKLRTDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(mklrt_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name // mkl_dfti.h has a macro that shadows the function with the same name diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc index 8f917e4904f..7b0ea3bb7f3 100644 --- a/paddle/fluid/platform/dynload/nccl.cc +++ b/paddle/fluid/platform/dynload/nccl.cc @@ -18,9 +18,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag nccl_dso_flag; -void *nccl_dso_handle; - #define DEFINE_WRAP(__name) DynLoad__##__name __name NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index f0679b2bce1..318948a1b29 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -16,28 +16,14 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/nccl.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag nccl_dso_flag; -extern void* nccl_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... 
args) -> decltype(__name(args...)) { \ - using nccl_func = decltype(&::__name); \ - std::call_once(nccl_dso_flag, []() { \ - nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(nccl_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define NCCL_RAND_ROUTINE_EACH(__macro) \ @@ -57,30 +43,30 @@ extern void* nccl_dso_handle; __macro(ncclReduceScatter); \ __macro(ncclGetErrorString); -NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) +NCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #if NCCL_VERSION_CODE >= 2212 #define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast); -NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) +NCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2304 #define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); -NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) +NCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2703 #define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ __macro(ncclSend); \ __macro(ncclRecv); -NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) +NCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 21100 #define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ __macro(ncclRedOpCreatePreMulSum); \ __macro(ncclRedOpDestroy); -NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) +NCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #endif } // namespace dynload diff --git a/paddle/fluid/platform/dynload/nvjpeg.cc b/paddle/fluid/platform/dynload/nvjpeg.cc index eb0ad78b9b7..006efd29121 100644 --- a/paddle/fluid/platform/dynload/nvjpeg.cc +++ b/paddle/fluid/platform/dynload/nvjpeg.cc @@ -15,9 +15,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag nvjpeg_dso_flag; -void *nvjpeg_dso_handle; - #define DEFINE_WRAP(__name) DynLoad__##__name __name NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h index ae457b2958f..0e137173e4a 100644 --- a/paddle/fluid/platform/dynload/nvjpeg.h +++ b/paddle/fluid/platform/dynload/nvjpeg.h @@ -14,27 +14,14 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/nvjpeg.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag nvjpeg_dso_flag; -extern void *nvjpeg_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - nvjpegStatus_t operator()(Args... 
args) { \ - using nvjpegFunc = decltype(&::__name); \ - std::call_once(nvjpeg_dso_flag, []() { \ - nvjpeg_dso_handle = paddle::platform::dynload::GetNvjpegDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define NVJPEG_RAND_ROUTINE_EACH(__macro) \ @@ -44,7 +31,7 @@ extern void *nvjpeg_dso_handle; __macro(nvjpegJpegStateDestroy); \ __macro(nvjpegDecode); -NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP); +NVJPEG_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/nvrtc.cc b/paddle/fluid/platform/dynload/nvrtc.cc index 74dfa5b3c22..a07613b9bf4 100644 --- a/paddle/fluid/platform/dynload/nvrtc.cc +++ b/paddle/fluid/platform/dynload/nvrtc.cc @@ -13,23 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/nvrtc.h" +#include "paddle/pten/backends/dynload/nvrtc.h" namespace paddle { namespace platform { namespace dynload { -std::once_flag nvrtc_dso_flag; -void* nvrtc_dso_handle = nullptr; - #define DEFINE_WRAP(__name) DynLoad__##__name __name NVRTC_ROUTINE_EACH(DEFINE_WRAP); -bool HasNVRTC() { - std::call_once(nvrtc_dso_flag, - []() { nvrtc_dso_handle = GetNVRTCDsoHandle(); }); - return nvrtc_dso_handle != nullptr; -} +bool HasNVRTC() { return pten::dynload::HasNVRTC(); } } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h index 720450d28b1..b71d0b0231c 100644 --- a/paddle/fluid/platform/dynload/nvrtc.h +++ b/paddle/fluid/platform/dynload/nvrtc.h @@ -17,30 +17,17 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/nvrtc.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag nvrtc_dso_flag; -extern void* nvrtc_dso_handle; extern bool HasNVRTC(); -#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using nvrtc_func = decltype(&::__name); \ - std::call_once(nvrtc_dso_flag, []() { \ - nvrtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(nvrtc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name +#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name /** * include all needed nvrtc functions @@ -56,9 +43,9 @@ extern bool HasNVRTC(); __macro(nvrtcGetProgramLog); \ __macro(nvrtcGetProgramLogSize) -NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); +NVRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); -#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/nvtx.cc b/paddle/fluid/platform/dynload/nvtx.cc index 372f8500e54..29683b2f2d4 100644 --- a/paddle/fluid/platform/dynload/nvtx.cc +++ b/paddle/fluid/platform/dynload/nvtx.cc @@ -18,9 +18,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag nvtx_dso_flag; -void *nvtx_dso_handle; - #define DEFINE_WRAP(__name) DynLoad__##__name __name NVTX_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h index b696bbf9181..64782612379 100644 --- a/paddle/fluid/platform/dynload/nvtx.h +++ b/paddle/fluid/platform/dynload/nvtx.h @@ -17,36 +17,23 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/nvtx.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag nvtx_dso_flag; -extern void *nvtx_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - int operator()(Args... args) { \ - using nvtxFunc = decltype(&::__name); \ - std::call_once(nvtx_dso_flag, []() { \ - nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define NVTX_ROUTINE_EACH(__macro) \ __macro(nvtxRangePushA); \ __macro(nvtxRangePop); -NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP); +NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP); -#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/rccl.cc b/paddle/fluid/platform/dynload/rccl.cc index e19c22ba6d9..82838da685b 100644 --- a/paddle/fluid/platform/dynload/rccl.cc +++ b/paddle/fluid/platform/dynload/rccl.cc @@ -18,9 +18,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag rccl_dso_flag; -void *rccl_dso_handle; - #define DEFINE_WRAP(__name) DynLoad__##__name __name RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h index ac9ab657d5e..5512756028e 100644 --- a/paddle/fluid/platform/dynload/rccl.h +++ b/paddle/fluid/platform/dynload/rccl.h @@ -16,28 +16,14 @@ limitations under the License. 
*/ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/rccl.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag rccl_dso_flag; -extern void* rccl_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using nccl_func = decltype(&::__name); \ - std::call_once(rccl_dso_flag, []() { \ - rccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(rccl_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define RCCL_RAND_ROUTINE_EACH(__macro) \ @@ -57,18 +43,18 @@ extern void* rccl_dso_handle; __macro(ncclReduceScatter); \ __macro(ncclGetErrorString); -RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) +RCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #if NCCL_VERSION_CODE >= 2212 #define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast); -RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) +RCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2703 #define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ __macro(ncclSend); \ __macro(ncclRecv); -RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) +RCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif } // namespace dynload diff --git a/paddle/fluid/platform/dynload/rocblas.cc b/paddle/fluid/platform/dynload/rocblas.cc index ee774195363..771989123c7 100644 --- a/paddle/fluid/platform/dynload/rocblas.cc +++ b/paddle/fluid/platform/dynload/rocblas.cc @@ -17,8 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { namespace dynload { -std::once_flag rocblas_dso_flag; -void *rocblas_dso_handle = nullptr; #define DEFINE_WRAP(__name) DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/rocblas.h b/paddle/fluid/platform/dynload/rocblas.h index 45614f2209f..a73bd61bda7 100644 --- a/paddle/fluid/platform/dynload/rocblas.h +++ b/paddle/fluid/platform/dynload/rocblas.h @@ -19,16 +19,12 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/rocblas.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag rocblas_dso_flag; -extern void *rocblas_dso_handle; - /** * The following macro definition can generate structs * (for each function) to dynamic load cublas routine @@ -36,18 +32,8 @@ extern void *rocblas_dso_handle; * * note: default dynamic linked libs */ -#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - rocblas_status operator()(Args... 
args) { \ - using rocblas_func = decltype(&::__name); \ - std::call_once(rocblas_dso_flag, []() { \ - rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \ @@ -83,7 +69,7 @@ extern void *rocblas_dso_handle; __macro(rocblas_set_pointer_mode); \ __macro(rocblas_get_pointer_mode); -ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) +ROCBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) // APIs available after CUDA 8.0 #define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ @@ -94,21 +80,21 @@ ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) __macro(rocblas_zgemm_strided_batched); \ __macro(rocblas_hgemm_strided_batched); -ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) +ROCBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) // HIP not supported in ROCM3.5 // #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro) // __macro(cublasSetMathMode); // __macro(cublasGetMathMode); -// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) +// ROCBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) #define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ __macro(rocblas_gemm_batched_ex); \ __macro(rocblas_gemm_strided_batched_ex); -ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) +ROCBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) -#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocm_driver.cc b/paddle/fluid/platform/dynload/rocm_driver.cc index 9ec123b632f..46542066592 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.cc +++ b/paddle/fluid/platform/dynload/rocm_driver.cc @@ -13,22 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/rocm_driver.h" +#include "paddle/pten/backends/dynload/rocm_driver.h" namespace paddle { namespace platform { namespace dynload { -std::once_flag rocm_dso_flag; -void* rocm_dso_handle = nullptr; - #define DEFINE_WRAP(__name) DynLoad__##__name __name ROCM_ROUTINE_EACH(DEFINE_WRAP); -bool HasCUDADriver() { - std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); }); - return rocm_dso_handle != nullptr; -} +bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); } } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h index 4527b6d6e44..2556ca3b337 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.h +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -17,30 +17,17 @@ limitations under the License. 
*/ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/rocm_driver.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag rocm_dso_flag; -extern void* rocm_dso_handle; extern bool HasCUDADriver(); -#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using rocm_func = decltype(&::__name); \ - std::call_once(rocm_dso_flag, []() { \ - rocm_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \ - }); \ - static void* p_##__name = dlsym(rocm_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name +#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name /** * include all needed cuda driver functions @@ -59,9 +46,9 @@ extern bool HasCUDADriver(); __macro(hipGetDeviceCount); \ __macro(hipDevicePrimaryCtxGetState) -ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); +ROCM_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP); -#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/warpctc.cc b/paddle/fluid/platform/dynload/warpctc.cc index 4a150048959..48c78a13073 100644 --- a/paddle/fluid/platform/dynload/warpctc.cc +++ b/paddle/fluid/platform/dynload/warpctc.cc @@ -18,9 +18,6 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag warpctc_dso_flag; -void* warpctc_dso_handle = nullptr; - #define DEFINE_WRAP(__name) DynLoad__##__name __name WARPCTC_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h index 5f1b7612117..ea5adefa111 100644 --- a/paddle/fluid/platform/dynload/warpctc.h +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -16,34 +16,19 @@ limitations under the License. */ #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" -#include "warpctc/include/ctc.h" +#include "paddle/pten/backends/dynload/warpctc.h" namespace paddle { namespace platform { namespace dynload { -extern std::once_flag warpctc_dso_flag; -extern void* warpctc_dso_handle; - /** * The following macro definition can generate structs * (for each function) to dynamic load warpctc routine * via operator overloading. */ -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using warpctcFunc = decltype(&::__name); \ - std::call_once(warpctc_dso_flag, []() { \ - warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \ - }); \ - static void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ - return reinterpret_cast(p_##_name)(args...); \ - } \ - }; \ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + using DynLoad__##__name = pten::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name #define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 30930897ea8..32f233e44e9 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -65,30 +65,30 @@ limitations under the License. 
*/ #include "glog/logging.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/variant.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/to_string.h" +#include "paddle/pten/backends/dynload/port.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/dynload/cublas.h" -#include "paddle/fluid/platform/dynload/cudnn.h" -#include "paddle/fluid/platform/dynload/curand.h" -#include "paddle/fluid/platform/dynload/cusolver.h" +#include "paddle/pten/backends/dynload/cublas.h" +#include "paddle/pten/backends/dynload/cudnn.h" +#include "paddle/pten/backends/dynload/curand.h" +#include "paddle/pten/backends/dynload/cusolver.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include -#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/pten/backends/dynload/nccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_CUDA #ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/hipfft.h" -#include "paddle/fluid/platform/dynload/hiprand.h" -#include "paddle/fluid/platform/dynload/miopen.h" -#include "paddle/fluid/platform/dynload/rocblas.h" +#include "paddle/pten/backends/dynload/hipfft.h" +#include "paddle/pten/backends/dynload/hiprand.h" +#include "paddle/pten/backends/dynload/miopen.h" +#include "paddle/pten/backends/dynload/rocblas.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #include // NOLINT -#include "paddle/fluid/platform/dynload/rccl.h" +#include "paddle/pten/backends/dynload/rccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_HIP @@ -880,7 +880,7 @@ inline bool is_error(cudnnStatus_t stat) { inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { std::ostringstream sout; sout << "CUDNN error(" << stat << "), " - << platform::dynload::cudnnGetErrorString(stat) << ". " + << pten::dynload::cudnnGetErrorString(stat) << ". " << GetExternalErrorMsg(stat); return sout.str(); } @@ -945,7 +945,7 @@ inline bool is_error(ncclResult_t nccl_result) { inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { std::ostringstream sout; sout << "NCCL error(" << nccl_result << "), " - << platform::dynload::ncclGetErrorString(nccl_result) << ". "; + << pten::dynload::ncclGetErrorString(nccl_result) << ". 
"; if (errno == ENOSPC || errno == EAGAIN) { std::string detail(strerror(errno)); detail += "\nPlease try one of the following solutions:"; @@ -1090,7 +1090,7 @@ inline bool is_error(miopenStatus_t stat) { inline std::string build_rocm_error_msg(miopenStatus_t stat) { std::string msg(" Miopen error, "); - return msg + platform::dynload::miopenGetErrorString(stat) + " "; + return msg + pten::dynload::miopenGetErrorString(stat) + " "; } /***** ROCBLAS ERROR *****/ @@ -1132,7 +1132,7 @@ inline bool is_error(ncclResult_t nccl_result) { inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { std::string msg(" Rccl error, "); - return msg + platform::dynload::ncclGetErrorString(nccl_result) + " "; + return msg + pten::dynload::ncclGetErrorString(nccl_result) + " "; } #endif // not(__APPLE__) and PADDLE_WITH_NCCL @@ -1141,7 +1141,7 @@ inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; } inline std::string build_rocm_error_msg(hipfftResult_t stat) { std::string msg(" HIPFFT error, "); - return msg + platform::dynload::hipfftGetErrorString(stat) + " "; + return msg + pten::dynload::hipfftGetErrorString(stat) + " "; } namespace details { diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc index 07263153164..92d218504ea 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -24,6 +24,8 @@ limitations under the License. */ #include #elif defined(_MSC_VER) #include +#else +#include #endif #include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN diff --git a/paddle/fluid/platform/os_info.h b/paddle/fluid/platform/os_info.h index c84738247a4..d8e3b0524f4 100644 --- a/paddle/fluid/platform/os_info.h +++ b/paddle/fluid/platform/os_info.h @@ -19,7 +19,7 @@ limitations under the License. */ #ifdef _POSIX_C_SOURCE #include #endif -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h index 09dcc4369be..0054968e525 100644 --- a/paddle/fluid/platform/timer.h +++ b/paddle/fluid/platform/timer.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/platform/port.h" +#include "paddle/pten/backends/dynload/port.h" #ifdef _WIN32 static unsigned sleep(unsigned seconds) { diff --git a/paddle/pten/backends/CMakeLists.txt b/paddle/pten/backends/CMakeLists.txt index e45adefe652..3587910ff50 100644 --- a/paddle/pten/backends/CMakeLists.txt +++ b/paddle/pten/backends/CMakeLists.txt @@ -1,2 +1,5 @@ +add_subdirectory(dynload) + add_subdirectory(cpu) + cc_library(pten_context SRCS all_context.cc DEPS device_context) diff --git a/paddle/pten/backends/dynload/CMakeLists.txt b/paddle/pten/backends/dynload/CMakeLists.txt new file mode 100644 index 00000000000..b7242fc76df --- /dev/null +++ b/paddle/pten/backends/dynload/CMakeLists.txt @@ -0,0 +1,57 @@ +cc_library(pten_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags) + +list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) + +if (NOT WITH_NV_JETSON) + list(APPEND CUDA_SRCS nvjpeg.cc) +endif() + +if (WITH_ROCM) + list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc) +endif() + +# There is no macOS version of NCCL. +# Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows. 
+if (NOT APPLE) + list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) + if (WITH_NCCL) + list(APPEND CUDA_SRCS nccl.cc) + endif() + if (WITH_ROCM) + list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc) + if (WITH_RCCL) + list(APPEND HIP_SRCS rccl.cc) + endif() + endif() +endif() + +if (TENSORRT_FOUND) + list(APPEND CUDA_SRCS tensorrt.cc) +endif() + +configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) +if (CUPTI_FOUND) + list(APPEND CUDA_SRCS cupti.cc) +endif(CUPTI_FOUND) +if(WITH_ROCM) + hip_library(pten_dynload_cuda SRCS ${HIP_SRCS} DEPS pten_dynamic_loader) + cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc) +elseif (WITH_ASCEND_CL) + cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc npu_hccl) +else() + nv_library(pten_dynload_cuda SRCS ${CUDA_SRCS} DEPS pten_dynamic_loader) + cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc) +endif() +if (WITH_MKLML) + cc_library(pten_dynload_mklml SRCS mklml.cc DEPS pten_dynamic_loader mklml) +endif() + +cc_library(pten_dynload_lapack SRCS lapack.cc DEPS pten_dynamic_loader) +add_dependencies(pten_dynload_lapack extern_lapack) +# TODO(TJ): add iomp, mkldnn? + +if (MKL_FOUND AND WITH_ONEMKL) + message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") + cc_library(pten_dynload_mklrt SRCS mklrt.cc DEPS pten_dynamic_loader) + target_include_directories(pten_dynload_mklrt PRIVATE ${MKL_INCLUDE}) +endif() diff --git a/paddle/pten/backends/dynload/cublas.cc b/paddle/pten/backends/dynload/cublas.cc new file mode 100644 index 00000000000..c1c819346ed --- /dev/null +++ b/paddle/pten/backends/dynload/cublas.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/cublas.h" + +namespace pten { +namespace dynload { +std::once_flag cublas_dso_flag; +void *cublas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2 +CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + +#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3 +CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); +#endif + +#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4 +CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); +#endif +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cublas.h b/paddle/pten/backends/dynload/cublas.h new file mode 100644 index 00000000000..4748b40a247 --- /dev/null +++ b/paddle/pten/backends/dynload/cublas.h @@ -0,0 +1,136 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag cublas_dso_flag; +extern void *cublas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using cublas_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ + std::call_once(cublas_dso_flag, []() { \ + cublas_dso_handle = pten::dynload::GetCublasDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSaxpy_v2); \ + __macro(cublasDaxpy_v2); \ + __macro(cublasCaxpy_v2); \ + __macro(cublasZaxpy_v2); \ + __macro(cublasSscal_v2); \ + __macro(cublasDscal_v2); \ + __macro(cublasScopy_v2); \ + __macro(cublasDcopy_v2); \ + __macro(cublasSgemv_v2); \ + __macro(cublasDgemv_v2); \ + __macro(cublasCgemv_v2); \ + __macro(cublasZgemv_v2); \ + __macro(cublasSgemm_v2); \ + __macro(cublasDgemm_v2); \ + __macro(cublasCgemm_v2); \ + __macro(cublasZgemm_v2); \ + __macro(cublasHgemm); \ + __macro(cublasSgemmEx); \ + __macro(cublasSgeam); \ + __macro(cublasDgeam); \ + __macro(cublasStrsm_v2); \ + __macro(cublasDtrsm_v2); \ + __macro(cublasCtrsm_v2); \ + __macro(cublasZtrsm_v2); \ + __macro(cublasCreate_v2); \ + __macro(cublasDestroy_v2); \ + __macro(cublasSetStream_v2); \ + __macro(cublasSetPointerMode_v2); \ + __macro(cublasGetPointerMode_v2); \ + __macro(cublasSgemmBatched); \ + __macro(cublasDgemmBatched); \ + __macro(cublasCgemmBatched); \ + __macro(cublasZgemmBatched); \ + __macro(cublasStrsmBatched); \ + __macro(cublasDtrsmBatched); \ + __macro(cublasCtrsmBatched); \ + __macro(cublasZtrsmBatched); \ + __macro(cublasSgetrfBatched); \ + __macro(cublasSgetriBatched); \ + __macro(cublasDgetrfBatched); \ + __macro(cublasDgetriBatched); \ + __macro(cublasSmatinvBatched); \ + __macro(cublasDmatinvBatched); \ + __macro(cublasSgetrsBatched); \ + __macro(cublasDgetrsBatched); + +CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) + +// APIs available after CUDA 8.0 +#if CUDA_VERSION >= 8000 +#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ + __macro(cublasGemmEx); \ + __macro(cublasSgemmStridedBatched); \ + __macro(cublasDgemmStridedBatched); \ + __macro(cublasCgemmStridedBatched); \ + __macro(cublasZgemmStridedBatched); \ + __macro(cublasHgemmStridedBatched); + +CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +#endif + +// APIs available after CUDA 9.0 +#if CUDA_VERSION >= 9000 +#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ + __macro(cublasSetMathMode); \ + __macro(cublasGetMathMode); + 
+CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +#endif + +// APIs available after CUDA 9.1 +#if CUDA_VERSION >= 9010 +#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ + __macro(cublasGemmBatchedEx); \ + __macro(cublasGemmStridedBatchedEx); + +CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +#endif + +#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cublasLt.cc b/paddle/pten/backends/dynload/cublasLt.cc new file mode 100644 index 00000000000..5d6ce6c7b95 --- /dev/null +++ b/paddle/pten/backends/dynload/cublasLt.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/cublasLt.h" + +namespace pten { +namespace dynload { +std::once_flag cublasLt_dso_flag; +void *cublasLt_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUBLASLT_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cublasLt.h b/paddle/pten/backends/dynload/cublasLt.h new file mode 100644 index 00000000000..68c23b30b66 --- /dev/null +++ b/paddle/pten/backends/dynload/cublasLt.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag cublasLt_dso_flag; +extern void *cublasLt_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublasLt routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using cublasLt_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ + std::call_once(cublasLt_dso_flag, []() { \ + cublasLt_dso_handle = pten::dynload::GetCublasLtDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +// APIs available after CUDA 10.1 +// #if CUDA_VERSION >= 10100 +#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasLtCreate); \ + __macro(cublasLtDestroy); \ + __macro(cublasLtMatmul); \ + __macro(cublasLtMatmulDescCreate); \ + __macro(cublasLtMatmulDescDestroy); \ + __macro(cublasLtMatmulDescSetAttribute); \ + __macro(cublasLtMatrixLayoutCreate); \ + __macro(cublasLtMatrixLayoutDestroy); \ + __macro(cublasLtMatrixLayoutSetAttribute); \ + __macro(cublasLtMatrixTransform); \ + __macro(cublasLtMatrixTransformDescCreate); \ + __macro(cublasLtMatrixTransformDescDestroy); \ + __macro(cublasLtMatrixTransformDescSetAttribute); + +CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) +// #endif + +#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cuda_driver.cc b/paddle/pten/backends/dynload/cuda_driver.cc new file mode 100644 index 00000000000..ae72a6a5740 --- /dev/null +++ b/paddle/pten/backends/dynload/cuda_driver.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/cuda_driver.h" + +namespace pten { +namespace dynload { + +std::once_flag cuda_dso_flag; +void* cuda_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +#if CUDA_VERSION >= 10020 +CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP); +#endif +CUDA_ROUTINE_EACH(DEFINE_WRAP); + +bool HasCUDADriver() { + std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); }); + return cuda_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cuda_driver.h b/paddle/pten/backends/dynload/cuda_driver.h new file mode 100644 index 00000000000..6b91d7049a9 --- /dev/null +++ b/paddle/pten/backends/dynload/cuda_driver.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag cuda_dso_flag; +extern void* cuda_dso_handle; +extern bool HasCUDADriver(); + +#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using cuda_func = decltype(&::__name); \ + std::call_once(cuda_dso_flag, []() { \ + cuda_dso_handle = pten::dynload::GetCUDADsoHandle(); \ + }); \ + static void* p_##__name = dlsym(cuda_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed cuda driver functions + **/ +#define CUDA_ROUTINE_EACH(__macro) \ + __macro(cuInit); \ + __macro(cuDriverGetVersion); \ + __macro(cuGetErrorString); \ + __macro(cuModuleLoadData); \ + __macro(cuModuleGetFunction); \ + __macro(cuModuleUnload); \ + __macro(cuOccupancyMaxActiveBlocksPerMultiprocessor); \ + __macro(cuLaunchKernel); \ + __macro(cuCtxCreate); \ + __macro(cuCtxGetCurrent); \ + __macro(cuDeviceGetCount); \ + __macro(cuDevicePrimaryCtxGetState); \ + __macro(cuDeviceGetAttribute); \ + __macro(cuDeviceGet) + +#if CUDA_VERSION >= 10020 +#define CUDA_ROUTINE_EACH_VVM(__macro) \ + __macro(cuMemGetAllocationGranularity); \ + __macro(cuMemAddressReserve); \ + __macro(cuMemCreate); \ + __macro(cuMemMap); \ + __macro(cuMemSetAccess); \ + __macro(cuMemUnmap); \ + __macro(cuMemRelease); \ + __macro(cuMemAddressFree) + +CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); +#endif + +CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cudnn.cc b/paddle/pten/backends/dynload/cudnn.cc new file mode 100644 index 00000000000..67447e7359f --- /dev/null +++ b/paddle/pten/backends/dynload/cudnn.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/backends/dynload/cudnn.h" +#include "paddle/fluid/platform/enforce.h" + +namespace pten { +namespace dynload { + +std::once_flag cudnn_dso_flag; +void* cudnn_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8 +CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R7 +CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7 +CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7 +CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R8 +CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); +#endif + +bool HasCUDNN() { + std::call_once(cudnn_dso_flag, + []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); + return cudnn_dso_handle != nullptr; +} + +void EnforceCUDNNLoaded(const char* fn_name) { + PADDLE_ENFORCE_NOT_NULL( + cudnn_dso_handle, + paddle::platform::errors::PreconditionNotMet( + "Cannot load cudnn shared library. Cannot invoke method %s.", + fn_name)); +} + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cudnn.h b/paddle/pten/backends/dynload/cudnn.h new file mode 100644 index 00000000000..7e084dfe3a6 --- /dev/null +++ b/paddle/pten/backends/dynload/cudnn.h @@ -0,0 +1,199 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_CUDA +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag cudnn_dso_flag; +extern void* cudnn_dso_handle; +extern bool HasCUDNN(); + +extern void EnforceCUDNNLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using cudnn_func = decltype(&::__name); \ + std::call_once(cudnn_dso_flag, []() { \ + cudnn_dso_handle = pten::dynload::GetCUDNNDsoHandle(); \ + }); \ + EnforceCUDNNLoaded(#__name); \ + static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed cudnn functions in HPPL + * different cudnn version has different interfaces + **/ +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnSetTensorNdDescriptor); \ + __macro(cudnnGetTensorNdDescriptor); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetFilterNdDescriptor); \ + __macro(cudnnGetFilterNdDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnSetPoolingNdDescriptor); \ + __macro(cudnnGetPoolingNdDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + __macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnSetConvolutionNdDescriptor); \ + __macro(cudnnGetConvolutionNdDescriptor); \ + __macro(cudnnDeriveBNTensorDescriptor); \ + __macro(cudnnCreateSpatialTransformerDescriptor); \ + __macro(cudnnSetSpatialTransformerNdDescriptor); \ + __macro(cudnnDestroySpatialTransformerDescriptor); \ + __macro(cudnnSpatialTfGridGeneratorForward); \ + __macro(cudnnSpatialTfGridGeneratorBackward); \ + __macro(cudnnSpatialTfSamplerForward); \ + __macro(cudnnSpatialTfSamplerBackward); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + __macro(cudnnActivationBackward); \ + __macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ + __macro(cudnnFindConvolutionForwardAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardFilterAlgorithm); \ + __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ + __macro(cudnnGetErrorString); \ + __macro(cudnnCreateDropoutDescriptor); \ + __macro(cudnnDropoutGetStatesSize); \ + __macro(cudnnSetDropoutDescriptor); \ + __macro(cudnnRestoreDropoutDescriptor); \ + __macro(cudnnCreateRNNDescriptor); \ + __macro(cudnnGetRNNParamsSize); \ + __macro(cudnnGetRNNWorkspaceSize); \ + __macro(cudnnGetRNNTrainingReserveSize); \ + __macro(cudnnRNNForwardTraining); \ + __macro(cudnnRNNBackwardData); \ + __macro(cudnnRNNBackwardWeights); \ + __macro(cudnnRNNForwardInference); \ + __macro(cudnnDestroyDropoutDescriptor); \ + __macro(cudnnDestroyRNNDescriptor); \ + __macro(cudnnSetTensorNdDescriptorEx); \ + __macro(cudnnAddTensor); \ + __macro(cudnnConvolutionBackwardData); \ + __macro(cudnnConvolutionBackwardFilter); \ + __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); \ + __macro(cudnnBatchNormalizationForwardTraining); \ + 
__macro(cudnnBatchNormalizationForwardInference); \ + __macro(cudnnBatchNormalizationBackward); \ + __macro(cudnnCreateActivationDescriptor); \ + __macro(cudnnSetActivationDescriptor); \ + __macro(cudnnGetActivationDescriptor); \ + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); +CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ + __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ + __macro(cudnnSetRNNDescriptor); +CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +#if CUDNN_VERSION >= 7001 +#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(cudnnSetConvolutionGroupCount); \ + __macro(cudnnSetConvolutionMathType); \ + __macro(cudnnConvolutionBiasActivationForward); \ + __macro(cudnnCreateCTCLossDescriptor); \ + __macro(cudnnDestroyCTCLossDescriptor); \ + __macro(cudnnGetCTCLossDescriptor); \ + __macro(cudnnSetCTCLossDescriptor); \ + __macro(cudnnGetCTCLossWorkspaceSize); \ + __macro(cudnnCTCLoss); \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ + __macro(cudnnGetConvolutionForwardAlgorithm_v7); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount); +CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +#if CUDNN_VERSION >= 7201 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ + __macro(cudnnCreateRNNDataDescriptor); \ + __macro(cudnnDestroyRNNDataDescriptor); \ + __macro(cudnnSetRNNDataDescriptor); \ + __macro(cudnnSetRNNPaddingMode); \ + __macro(cudnnRNNForwardTrainingEx); \ + __macro(cudnnRNNBackwardDataEx); \ + __macro(cudnnRNNBackwardWeightsEx); \ + __macro(cudnnRNNForwardInferenceEx); +CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +#if CUDNN_VERSION >= 7401 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ + __macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ + __macro(cudnnBatchNormalizationForwardTrainingEx); \ + __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ + __macro(cudnnBatchNormalizationBackwardEx); \ + __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); +CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +#if CUDNN_VERSION >= 8000 +#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) \ + __macro(cudnnSetRNNDescriptor_v8); \ + __macro(cudnnCreateFusedOpsPlan); \ + __macro(cudnnCreateFusedOpsConstParamPack); \ + __macro(cudnnCreateFusedOpsVariantParamPack); \ + __macro(cudnnDestroyFusedOpsPlan); \ + __macro(cudnnDestroyFusedOpsConstParamPack); \ + __macro(cudnnDestroyFusedOpsVariantParamPack); \ + __macro(cudnnFusedOpsExecute); \ + __macro(cudnnSetFusedOpsConstParamPackAttribute); \ + __macro(cudnnSetFusedOpsVariantParamPackAttribute); \ + __macro(cudnnMakeFusedOpsPlan); +CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +} // namespace dynload +} // namespace pten + +#endif diff --git a/paddle/pten/backends/dynload/cufft.cc b/paddle/pten/backends/dynload/cufft.cc new file mode 100644 index 00000000000..5e146690e8d --- /dev/null +++ b/paddle/pten/backends/dynload/cufft.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/cufft.h" +#include "paddle/fluid/platform/enforce.h" + +namespace pten { +namespace dynload { +std::once_flag cufft_dso_flag; +void* cufft_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); + +bool HasCUFFT() { + std::call_once(cufft_dso_flag, + []() { cufft_dso_handle = GetCUFFTDsoHandle(); }); + return cufft_dso_handle != nullptr; +} + +void EnforceCUFFTLoaded(const char* fn_name) { + PADDLE_ENFORCE_NOT_NULL( + cufft_dso_handle, + paddle::platform::errors::PreconditionNotMet( + "Cannot load cufft shared library. Cannot invoke method %s.", + fn_name)); +} + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cufft.h b/paddle/pten/backends/dynload/cufft.h new file mode 100644 index 00000000000..7b0780b7316 --- /dev/null +++ b/paddle/pten/backends/dynload/cufft.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_CUDA +#include +#include +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag cufft_dso_flag; +extern void* cufft_dso_handle; +extern bool HasCUFFT(); + +extern void EnforceCUFFTLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using cufft_func = decltype(&::__name); \ + std::call_once(cufft_dso_flag, []() { \ + cufft_dso_handle = pten::dynload::GetCUFFTDsoHandle(); \ + }); \ + EnforceCUFFTLoaded(#__name); \ + static void* p_##__name = dlsym(cufft_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed cufft functions in HPPL + * different cufft version has different interfaces + **/ +#define CUFFT_FFT_ROUTINE_EACH(__macro) \ + __macro(cufftPlan1d); \ + __macro(cufftPlan2d); \ + __macro(cufftPlan3d); \ + __macro(cufftPlanMany); \ + __macro(cufftMakePlan1d); \ + __macro(cufftMakePlan2d); \ + __macro(cufftMakePlan3d); \ + __macro(cufftMakePlanMany); \ + __macro(cufftMakePlanMany64); \ + __macro(cufftGetSizeMany64); \ + __macro(cufftEstimate1d); \ + __macro(cufftEstimate2d); \ + __macro(cufftEstimate3d); \ + __macro(cufftEstimateMany); \ + __macro(cufftCreate); \ + __macro(cufftGetSize1d); \ + __macro(cufftGetSize2d); \ + __macro(cufftGetSize3d); \ + __macro(cufftGetSizeMany); \ + __macro(cufftGetSize); \ + __macro(cufftSetWorkArea); \ + __macro(cufftSetAutoAllocation); \ + __macro(cufftExecC2C); \ + __macro(cufftExecR2C); \ + __macro(cufftExecC2R); \ + __macro(cufftExecZ2Z); \ + __macro(cufftExecD2Z); \ + __macro(cufftExecZ2D); \ + __macro(cufftSetStream); \ + __macro(cufftDestroy); \ + __macro(cufftGetVersion); \ + __macro(cufftGetProperty); \ + __macro(cufftXtSetGPUs); \ + __macro(cufftXtMalloc); \ + __macro(cufftXtMemcpy); \ + __macro(cufftXtFree); \ + __macro(cufftXtSetWorkArea); \ + __macro(cufftXtExecDescriptorC2C); \ + __macro(cufftXtExecDescriptorR2C); \ + __macro(cufftXtExecDescriptorC2R); \ + __macro(cufftXtExecDescriptorZ2Z); \ + __macro(cufftXtExecDescriptorD2Z); \ + __macro(cufftXtExecDescriptorZ2D); \ + __macro(cufftXtQueryPlan); \ + __macro(cufftXtSetCallback); \ + __macro(cufftXtClearCallback); \ + __macro(cufftXtSetCallbackSharedSize); \ + __macro(cufftXtMakePlanMany); \ + __macro(cufftXtGetSizeMany); \ + __macro(cufftXtExec); \ + __macro(cufftXtExecDescriptor); \ + __macro(cufftXtSetWorkAreaPolicy); + +CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP) + +} // namespace dynload +} // namespace pten + +#endif diff --git a/paddle/pten/backends/dynload/cupti.cc b/paddle/pten/backends/dynload/cupti.cc new file mode 100644 index 00000000000..91d202dbff4 --- /dev/null +++ b/paddle/pten/backends/dynload/cupti.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_CUPTI + +#include "paddle/pten/backends/dynload/cupti.h" + +namespace pten { +namespace dynload { + +std::once_flag cupti_dso_flag; +void *cupti_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUPTI_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace pten + +#endif // PADDLE_WITH_CUPTI diff --git a/paddle/pten/backends/dynload/cupti.h b/paddle/pten/backends/dynload/cupti.h new file mode 100644 index 00000000000..fbc6993f458 --- /dev/null +++ b/paddle/pten/backends/dynload/cupti.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_CUPTI + +#include +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag cupti_dso_flag; +extern void *cupti_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cupti routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline CUptiResult CUPTIAPI operator()(Args... args) { \ + using cuptiFunc = decltype(&::__name); \ + std::call_once(cupti_dso_flag, []() { \ + cupti_dso_handle = pten::dynload::GetCUPTIDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(cupti_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define CUPTI_ROUTINE_EACH(__macro) \ + __macro(cuptiActivityEnable); \ + __macro(cuptiActivityDisable); \ + __macro(cuptiActivityRegisterCallbacks); \ + __macro(cuptiActivityGetAttribute); \ + __macro(cuptiActivitySetAttribute); \ + __macro(cuptiGetTimestamp); \ + __macro(cuptiActivityGetNextRecord); \ + __macro(cuptiGetResultString); \ + __macro(cuptiActivityGetNumDroppedRecords); \ + __macro(cuptiActivityFlushAll); \ + __macro(cuptiSubscribe); \ + __macro(cuptiUnsubscribe); \ + __macro(cuptiEnableCallback); \ + __macro(cuptiEnableDomain); + +CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP +} // namespace dynload +} // namespace pten + +#endif // PADDLE_WITH_CUPTI diff --git a/paddle/pten/backends/dynload/cupti_lib_path.h.in b/paddle/pten/backends/dynload/cupti_lib_path.h.in new file mode 100644 index 00000000000..017384bfbb7 --- /dev/null +++ b/paddle/pten/backends/dynload/cupti_lib_path.h.in @@ -0,0 +1,17 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@" diff --git a/paddle/pten/backends/dynload/curand.cc b/paddle/pten/backends/dynload/curand.cc new file mode 100644 index 00000000000..7a3218f44ca --- /dev/null +++ b/paddle/pten/backends/dynload/curand.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/curand.h" + +namespace pten { +namespace dynload { + +std::once_flag curand_dso_flag; +void *curand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/curand.h b/paddle/pten/backends/dynload/curand.h new file mode 100644 index 00000000000..4ab8d179c37 --- /dev/null +++ b/paddle/pten/backends/dynload/curand.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { +extern std::once_flag curand_dso_flag; +extern void *curand_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... 
args) { \ + using curandFunc = decltype(&::__name); \ + std::call_once(curand_dso_flag, []() { \ + curand_dso_handle = pten::dynload::GetCurandDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define CURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(curandCreateGenerator); \ + __macro(curandSetStream); \ + __macro(curandSetPseudoRandomGeneratorSeed); \ + __macro(curandGenerateUniform); \ + __macro(curandGenerateUniformDouble); \ + __macro(curandGenerateNormal); \ + __macro(curandDestroyGenerator); + +CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cusolver.cc b/paddle/pten/backends/dynload/cusolver.cc new file mode 100644 index 00000000000..581aaabd8ae --- /dev/null +++ b/paddle/pten/backends/dynload/cusolver.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/cusolver.h" + +namespace pten { +namespace dynload { + +std::once_flag cusolver_dso_flag; +void *cusolver_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUSOLVER_ROUTINE_EACH(DEFINE_WRAP); + +#ifdef CUSOLVER_ROUTINE_EACH_R1 +CUSOLVER_ROUTINE_EACH_R1(DEFINE_WRAP); +#endif + +#ifdef CUSOLVER_ROUTINE_EACH_R2 +CUSOLVER_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cusolver.h b/paddle/pten/backends/dynload/cusolver.h new file mode 100644 index 00000000000..f352686a009 --- /dev/null +++ b/paddle/pten/backends/dynload/cusolver.h @@ -0,0 +1,123 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { +extern std::once_flag cusolver_dso_flag; +extern void *cusolver_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cusolverStatus_t operator()(Args... 
args) { \ + using cusolverFunc = decltype(&::__name); \ + std::call_once(cusolver_dso_flag, []() { \ + cusolver_dso_handle = pten::dynload::GetCusolverDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define CUSOLVER_ROUTINE_EACH(__macro) \ + __macro(cusolverDnCreate); \ + __macro(cusolverDnDestroy); \ + __macro(cusolverDnSetStream); \ + __macro(cusolverDnSpotrf_bufferSize); \ + __macro(cusolverDnDpotrf_bufferSize); \ + __macro(cusolverDnSpotrf); \ + __macro(cusolverDnDpotrf); \ + __macro(cusolverDnSpotrs); \ + __macro(cusolverDnDpotrs); \ + __macro(cusolverDnCpotrs); \ + __macro(cusolverDnZpotrs); \ + __macro(cusolverDnSsyevd_bufferSize); \ + __macro(cusolverDnDsyevd_bufferSize); \ + __macro(cusolverDnCheevd_bufferSize); \ + __macro(cusolverDnZheevd_bufferSize); \ + __macro(cusolverDnSsyevd); \ + __macro(cusolverDnDsyevd); \ + __macro(cusolverDnCheevd); \ + __macro(cusolverDnZheevd); + +CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); + +#if CUDA_VERSION >= 9020 +#define CUSOLVER_ROUTINE_EACH_R1(__macro) \ + __macro(cusolverDnSpotrfBatched); \ + __macro(cusolverDnDpotrfBatched); \ + __macro(cusolverDnSpotrsBatched); \ + __macro(cusolverDnDpotrsBatched); \ + __macro(cusolverDnSgesvdj_bufferSize); \ + __macro(cusolverDnSgetrf_bufferSize); \ + __macro(cusolverDnDgetrf_bufferSize); \ + __macro(cusolverDnCgetrf_bufferSize); \ + __macro(cusolverDnZgetrf_bufferSize); \ + __macro(cusolverDnSgeqrf_bufferSize); \ + __macro(cusolverDnDgeqrf_bufferSize); \ + __macro(cusolverDnCgeqrf_bufferSize); \ + __macro(cusolverDnZgeqrf_bufferSize); \ + __macro(cusolverDnSorgqr_bufferSize); \ + __macro(cusolverDnDorgqr_bufferSize); \ + __macro(cusolverDnSormqr_bufferSize); \ + __macro(cusolverDnDormqr_bufferSize); \ + __macro(cusolverDnCungqr_bufferSize); \ + __macro(cusolverDnZungqr_bufferSize); \ + __macro(cusolverDnDestroyGesvdjInfo); \ + __macro(cusolverDnCreateGesvdjInfo); \ + __macro(cusolverDnDgesvdj_bufferSize); \ + __macro(cusolverDnSgesvdj); \ + __macro(cusolverDnDgesvdj); \ + __macro(cusolverDnSgetrf); \ + __macro(cusolverDnDgetrf); \ + __macro(cusolverDnCgetrf); \ + __macro(cusolverDnZgetrf); \ + __macro(cusolverDnSgeqrf); \ + __macro(cusolverDnDgeqrf); \ + __macro(cusolverDnCgeqrf); \ + __macro(cusolverDnZgeqrf); \ + __macro(cusolverDnSorgqr); \ + __macro(cusolverDnDorgqr); \ + __macro(cusolverDnSormqr); \ + __macro(cusolverDnDormqr); \ + __macro(cusolverDnCungqr); \ + __macro(cusolverDnZungqr); + +CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) +#endif + +#if CUDA_VERSION >= 9020 +#define CUSOLVER_ROUTINE_EACH_R2(__macro) \ + __macro(cusolverDnCreateSyevjInfo); \ + __macro(cusolverDnSsyevj_bufferSize); \ + __macro(cusolverDnDsyevj_bufferSize); \ + __macro(cusolverDnSsyevj); \ + __macro(cusolverDnDsyevj); \ + __macro(cusolverDnDestroySyevjInfo); + +CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) +#endif + +#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cusparse.cc b/paddle/pten/backends/dynload/cusparse.cc new file mode 100644 index 00000000000..4277f14149f --- /dev/null +++ b/paddle/pten/backends/dynload/cusparse.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/cusparse.h" + +namespace pten { +namespace dynload { + +std::once_flag cusparse_dso_flag; +void *cusparse_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +#ifdef CUSPARSE_ROUTINE_EACH +CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); +#endif + +#ifdef CUSPARSE_ROUTINE_EACH_R2 +CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + +#ifdef CUSPARSE_ROUTINE_EACH_11020 +CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP); +#endif +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/cusparse.h b/paddle/pten/backends/dynload/cusparse.h new file mode 100644 index 00000000000..d9a7fab8779 --- /dev/null +++ b/paddle/pten/backends/dynload/cusparse.h @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { +extern std::once_flag cusparse_dso_flag; +extern void *cusparse_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cusparseStatus_t operator()(Args... 
args) { \ + using cusparseFunc = decltype(&::__name); \ + std::call_once(cusparse_dso_flag, []() { \ + cusparse_dso_handle = pten::dynload::GetCusparseDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#if defined(PADDLE_WITH_CUDA) +// The generic APIs is supported from CUDA10.1 +#if CUDA_VERSION >= 10010 +#define CUSPARSE_ROUTINE_EACH(__macro) \ + __macro(cusparseCreate); \ + __macro(cusparseSetStream); \ + __macro(cusparseCreateMatDescr); \ + __macro(cusparseDestroy); \ + __macro(cusparseSnnz); \ + __macro(cusparseDnnz); \ + __macro(cusparseSetMatType); \ + __macro(cusparseSetMatIndexBase); + +CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP); + +// APIs available after CUDA 11.2 +#if CUDA_VERSION >= 11020 +#define CUSPARSE_ROUTINE_EACH_11020(__macro) \ + __macro(cusparseCreateCsr); \ + __macro(cusparseCreateCoo); \ + __macro(cusparseCreateDnMat); \ + __macro(cusparseSpMM_bufferSize); \ + __macro(cusparseSpMM); \ + __macro(cusparseDestroySpMat); \ + __macro(cusparseDestroyDnMat); \ + __macro(cusparseCooSetPointers); \ + __macro(cusparseCsrSetPointers); \ + __macro(cusparseDenseToSparse_bufferSize); \ + __macro(cusparseDenseToSparse_analysis); \ + __macro(cusparseDenseToSparse_convert); \ + __macro(cusparseSparseToDense_bufferSize); \ + __macro(cusparseSparseToDense); + +CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) + +// APIs available after CUDA 11.3 +#if CUDA_VERSION >= 11030 +#define CUSPARSE_ROUTINE_EACH_R2(__macro) \ + __macro(cusparseSDDMM_bufferSize); \ + __macro(cusparseSDDMM_preprocess); \ + __macro(cusparseSDDMM); + +CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) +#endif +#endif +#endif +#endif + +#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/dynamic_loader.cc b/paddle/pten/backends/dynload/dynamic_loader.cc new file mode 100644 index 00000000000..2817b1520e2 --- /dev/null +++ b/paddle/pten/backends/dynload/dynamic_loader.cc @@ -0,0 +1,585 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/pten/backends/dynload/dynamic_loader.h" + +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/backends/dynload/cupti_lib_path.h" + +#if defined(_WIN32) +#include +#endif + +// TODO(wilber): The pten computing library requires a component to manage flags +// (maybe not use gflags). +#include "gflags/gflags.h" +#include "glog/logging.h" + +DEFINE_string(cudnn_dir, + "", + "Specify path for loading libcudnn.so. For instance, " + "/usr/local/cudnn/lib. If empty [default], dlopen " + "will search cudnn from LD_LIBRARY_PATH"); + +DEFINE_string( + cuda_dir, + "", + "Specify path for loading cuda library, such as libcublas, libcublasLt " + "libcurand, libcusolver. For instance, /usr/local/cuda/lib64. 
" + "If default, dlopen will search cuda from LD_LIBRARY_PATH"); + +DEFINE_string(nccl_dir, + "", + "Specify path for loading nccl library, such as libnccl.so. " + "For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); + +DEFINE_string(hccl_dir, + "", + "Specify path for loading hccl library, such as libhccl.so. " + "For instance, " + "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. If " + "default, " + "dlopen will search hccl from LD_LIBRARY_PATH"); + +DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); + +DEFINE_string( + tensorrt_dir, + "", + "Specify path for loading tensorrt library, such as libnvinfer.so."); + +DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); + +DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); + +DEFINE_string(mkl_dir, + "", + "Specify path for loading libmkl_rt.so. " + "For insrance, /opt/intel/oneapi/mkl/latest/lib/intel64/." + "If default, " + "dlopen will search mkl from LD_LIBRARY_PATH"); + +DEFINE_string(op_dir, "", "Specify path for loading user-defined op library."); + +#ifdef PADDLE_WITH_HIP + +DEFINE_string(miopen_dir, + "", + "Specify path for loading libMIOpen.so. For instance, " + "/opt/rocm/miopen/lib. If empty [default], dlopen " + "will search miopen from LD_LIBRARY_PATH"); + +DEFINE_string(rocm_dir, + "", + "Specify path for loading rocm library, such as librocblas, " + "libmiopen, libhipsparse. For instance, /opt/rocm/lib. " + "If default, dlopen will search rocm from LD_LIBRARY_PATH"); + +DEFINE_string(rccl_dir, + "", + "Specify path for loading rccl library, such as librccl.so. " + "For instance, /opt/rocm/rccl/lib. If default, " + "dlopen will search rccl from LD_LIBRARY_PATH"); +#endif + +namespace pten { +namespace dynload { + +struct PathNode { + PathNode() {} + std::string path = ""; +}; + +static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; + +// NOTE: In order to adapt to the default installation path of cuda +#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) +static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin"; +#else +static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64"; +#endif + +static PathNode s_py_site_pkg_path; + +#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) +static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll"; +static constexpr char* win_cublas_lib = + "cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cublas64_" CUDA_VERSION_MAJOR ".dll"; +#if CUDA_VERSION >= 11000 +static constexpr char* win_curand_lib = + "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll"; +static constexpr char* win_nvjpeg_lib = + "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll"; +static constexpr char* win_cusolver_lib = + "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll"; +static constexpr char* win_cusparse_lib = + "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll"; +static constexpr char* win_cufft_lib = + "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_10.dll"; +#else +static constexpr char* win_curand_lib = + "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;curand64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_nvjpeg_lib = + "nvjpeg64_" 
CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_cusolver_lib = + "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_cusparse_lib = + "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_cufft_lib = + "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll"; +#endif // CUDA_VERSION +#endif + +static inline std::string join(const std::string& part1, + const std::string& part2) { + // directory separator + const char sep = '/'; + if (!part2.empty() && part2.front() == sep) { + return part2; + } + std::string ret; + ret.reserve(part1.size() + part2.size() + 1); + ret = part1; + if (!ret.empty() && ret.back() != sep) { + ret += sep; + } + ret += part2; + return ret; +} + +static inline std::vector split( + const std::string& str, const std::string separator = " ") { + std::vector str_list; + std::string::size_type firstPos; + firstPos = str.find_first_not_of(separator, 0); + std::string::size_type lastPos; + lastPos = str.find_first_of(separator, firstPos); + while (std::string::npos != firstPos && std::string::npos != lastPos) { + str_list.push_back(str.substr(firstPos, lastPos - firstPos)); + firstPos = str.find_first_not_of(separator, lastPos); + lastPos = str.find_first_of(separator, firstPos); + } + if (std::string::npos == lastPos) { + str_list.push_back(str.substr(firstPos, lastPos - firstPos)); + } + return str_list; +} + +void SetPaddleLibPath(const std::string& py_site_pkg_path) { + s_py_site_pkg_path.path = py_site_pkg_path; + VLOG(3) << "Set paddle lib path : " << py_site_pkg_path; +} + +static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path, + const std::string& dso_name, + int dynload_flags) { + void* dso_handle = nullptr; + if (!spec_path.empty()) { + // search xxx.so from custom path + VLOG(3) << "Try to find library: " << dso_name + << " from specific path: " << spec_path; + std::string dso_path = join(spec_path, dso_name); + dso_handle = dlopen(dso_path.c_str(), dynload_flags); + } + return dso_handle; +} + +static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, + int dynload_flags) { + // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + // and /usr/local/lib path + void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); + VLOG(3) << "Try to find library: " << dso_path + << " from default system path."; + +// TODO(chenweihang): This path is used to search which libs? +// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to +// bring System Integrity Protection (SIP), if dso_handle +// is null, search from default package path in Mac OS. +#if defined(__APPLE__) || defined(__OSX__) + if (nullptr == dso_handle) { + dso_handle = + dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags); + } +#endif + + return dso_handle; +} + +/* + * We define three priorities for dynamic library search: + * + * First: Search for path specified by the user + * Second: Search the system default path + * Third: Search for a special path corresponding to + * a specific library to adapt to changes and easy to expand. 
+ */ + +static inline void* GetDsoHandleFromSearchPath( + const std::string& config_path, + const std::string& dso_name, + bool throw_on_error = true, + const std::vector& extra_paths = std::vector(), + const std::string& warning_msg = std::string()) { +#if !defined(_WIN32) + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; +#else + int dynload_flags = 0; +#endif // !_WIN32 + std::vector dso_names = split(dso_name, ";"); + void* dso_handle = nullptr; + for (auto dso : dso_names) { + // 1. search in user config path by FLAGS + dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags); + // 2. search in system default path + if (nullptr == dso_handle) { + dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags); + } + // 3. search in extra paths + if (nullptr == dso_handle) { + for (auto path : extra_paths) { + VLOG(3) << "extra_paths: " << path; + dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags); + } + } + if (nullptr != dso_handle) break; + } + + // 4. [If Failed for All dso_names] logging warning if exists + if (nullptr == dso_handle && !warning_msg.empty()) { + LOG(WARNING) << warning_msg; + } + + // 5. [If Failed for All dso_names] logging or throw error info + if (nullptr == dso_handle) { + auto error_msg = + "The third-party dynamic library (%s) that Paddle depends on is not " + "configured correctly. (error code is %s)\n" + " Suggestions:\n" + " 1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) " + "is installed correctly and its version is matched with paddlepaddle " + "you installed.\n" + " 2. Configure third-party dynamic library environment variables as " + "follows:\n" + " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" + " - Windows: set PATH by `set PATH=XXX;%PATH%`\n" + " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` " + "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is " + "impossible unless System Integrity Protection (SIP) is disabled.]"; +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 + if (throw_on_error) { + // NOTE: Special error report case, no need to change its format + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + error_msg, dso_name, errorno)); + } else { + LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno); + } + } + + return dso_handle; +} + +void* GetCublasDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); +#elif defined(PADDLE_WITH_HIP) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); +#endif +} + +void* GetCublasLtDsoHandle() { +// APIs available after CUDA 10.1 +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100 + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so"); +#else + std::string warning_msg( + "Your CUDA_VERSION less 10.1, not support CublasLt. 
" + "If you want to use CublasLt, please upgrade CUDA and rebuild " + "PaddlePaddle."); + return nullptr; +#endif +} + +void* GetCUDNNDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + std::string mac_warn_meg( + "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " + "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " + "chmod a+r /usr/local/cuda/include/cudnn.h " + "/usr/local/cuda/lib/libcudnn*"); + return GetDsoHandleFromSearchPath( + FLAGS_cudnn_dir, "libcudnn.dylib", false, {}, mac_warn_meg); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + std::string win_warn_meg( + "Note: [Recommend] copy cudnn into CUDA installation directory. \n " + "For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from " + "NVIDIA's official website, \n" + "then, unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing " + "Toolkit\\CUDA\\v10.0\n" + "You should do this according to your CUDA installation directory and " + "CUDNN version."); + return GetDsoHandleFromSearchPath( + FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); +#elif defined(PADDLE_WITH_HIP) + return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path}); +#endif +} + +void* GetCUPTIDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path}); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path}); +#endif +} + +void* GetCurandDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path}); +#elif defined(PADDLE_WITH_HIP) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); +#endif +} + +#ifdef PADDLE_WITH_HIP +void* GetROCFFTDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so"); +#endif +} +#endif + +void* GetNvjpegDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_nvjpeg_lib, true, {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so"); +#endif +} + +void* GetCusolverDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so"); +#endif +} + +void* GetCusparseDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so"); +#endif +} + +void* 
GetNVRTCDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
+#endif
+}
+
+void* GetCUDADsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
+#elif defined(_WIN32)
+  char system32_dir[MAX_PATH];
+  GetSystemDirectory(system32_dir, MAX_PATH);
+  return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
+#endif
+}
+
+void* GetWarpCTCDsoHandle() {
+  std::string warpctc_dir = "";
+  if (!s_py_site_pkg_path.path.empty()) {
+    warpctc_dir = s_py_site_pkg_path.path;
+  }
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll");
+#else
+  return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so");
+#endif
+}
+
+void* GetNCCLDsoHandle() {
+#ifdef PADDLE_WITH_HIP
+  std::string warning_msg(
+      "You may need to install 'rccl' from the ROCm official website: "
+      "https://rocmdocs.amd.com/en/latest/Installation_Guide/"
+      "Installation-Guide.html before installing PaddlePaddle.");
+#else
+  std::string warning_msg(
+      "You may need to install 'nccl2' from the NVIDIA official website: "
+      "https://developer.nvidia.com/nccl/nccl-download "
+      "before installing PaddlePaddle.");
+#endif
+
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg);
+#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg);
+#else
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg);
+#endif
+}
+void* GetHCCLDsoHandle() {
+  std::string warning_msg(
+      "You may need to install 'hccl2' from the Huawei official website "
+      "before installing PaddlePaddle.");
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg);
+#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
+  return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
+
+#elif defined(PADDLE_WITH_ASCEND_CL)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_hccl_dir, "libhccl.so", true, {}, warning_msg);
+#else
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg);
+#endif
+}
+
+void* GetTensorRtDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
+#endif
+}
+
+void* GetMKLMLDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
+#endif
+}
+
+void* GetLAPACKDsoHandle() {
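+  // Load the versioned LAPACK library directly (liblapack.so.3 /
+  // liblapack.3.dylib) so that no unversioned development symlink is
+  // required; FLAGS_lapack_dir can point the search at a custom location.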
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3");
+#endif
+}
+
+void* GetOpDsoHandle(const std::string& dso_name) {
+  return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
+}
+
+void* GetNvtxDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  PADDLE_THROW(paddle::platform::errors::Unimplemented(
+      "Nvtx does not support Apple."));
+#elif defined(_WIN32)
+  PADDLE_THROW(paddle::platform::errors::Unimplemented(
+      "Nvtx does not support Windows."));
+#elif !defined(PADDLE_WITH_CUDA)
+  PADDLE_THROW(paddle::platform::errors::Unimplemented(
+      "Nvtx is not supported without CUDA."));
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
+#endif
+}
+
+void* GetCUFFTDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib");
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so");
+#endif
+}
+
+void* GetMKLRTDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so");
+#endif
+}
+
+}  // namespace dynload
+}  // namespace pten
diff --git a/paddle/pten/backends/dynload/dynamic_loader.h b/paddle/pten/backends/dynload/dynamic_loader.h
new file mode 100644
index 00000000000..d7c7a87d33d
--- /dev/null
+++ b/paddle/pten/backends/dynload/dynamic_loader.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+
+namespace pten {
+namespace dynload {
+
+#ifndef _WIN32
+#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
+#else
+#define DECLARE_TYPE(__name, ...)
decltype(auto) +#endif + +void* GetCublasDsoHandle(); +void* GetCublasLtDsoHandle(); +void* GetCUDNNDsoHandle(); +void* GetCUPTIDsoHandle(); +void* GetCurandDsoHandle(); +void* GetNvjpegDsoHandle(); +void* GetCusolverDsoHandle(); +void* GetCusparseDsoHandle(); +void* GetNVRTCDsoHandle(); +void* GetCUDADsoHandle(); +void* GetWarpCTCDsoHandle(); +void* GetNCCLDsoHandle(); +void* GetHCCLDsoHandle(); +void* GetTensorRtDsoHandle(); +void* GetMKLMLDsoHandle(); +void* GetLAPACKDsoHandle(); +void* GetOpDsoHandle(const std::string& dso_name); +void* GetNvtxDsoHandle(); +void* GetCUFFTDsoHandle(); +void* GetMKLRTDsoHandle(); +void* GetROCFFTDsoHandle(); + +void SetPaddleLibPath(const std::string&); + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/hipfft.cc b/paddle/pten/backends/dynload/hipfft.cc new file mode 100644 index 00000000000..a1d802fac04 --- /dev/null +++ b/paddle/pten/backends/dynload/hipfft.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/hipfft.h" + +namespace pten { +namespace dynload { + +std::once_flag hipfft_dso_flag; +void *hipfft_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/hipfft.h b/paddle/pten/backends/dynload/hipfft.h new file mode 100644 index 00000000000..a6f4e6ca8ce --- /dev/null +++ b/paddle/pten/backends/dynload/hipfft.h @@ -0,0 +1,122 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#ifdef PADDLE_WITH_HIP +#include + +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { +extern std::once_flag hipfft_dso_flag; +extern void *hipfft_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using hipfftFunc = decltype(&::__name); \ + std::call_once(hipfft_dso_flag, []() { \ + hipfft_dso_handle = pten::dynload::GetROCFFTDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define HIPFFT_FFT_ROUTINE_EACH(__macro) \ + __macro(hipfftPlan1d); \ + __macro(hipfftPlan2d); \ + __macro(hipfftPlan3d); \ + __macro(hipfftPlanMany); \ + __macro(hipfftMakePlan1d); \ + __macro(hipfftMakePlanMany); \ + __macro(hipfftMakePlanMany64); \ + __macro(hipfftGetSizeMany64); \ + __macro(hipfftEstimate1d); \ + __macro(hipfftEstimate2d); \ + __macro(hipfftEstimate3d); \ + __macro(hipfftEstimateMany); \ + __macro(hipfftCreate); \ + __macro(hipfftGetSize1d); \ + __macro(hipfftGetSizeMany); \ + __macro(hipfftGetSize); \ + __macro(hipfftSetWorkArea); \ + __macro(hipfftSetAutoAllocation); \ + __macro(hipfftExecC2C); \ + __macro(hipfftExecR2C); \ + __macro(hipfftExecC2R); \ + __macro(hipfftExecZ2Z); \ + __macro(hipfftExecD2Z); \ + __macro(hipfftExecZ2D); \ + __macro(hipfftSetStream); \ + __macro(hipfftDestroy); \ + __macro(hipfftGetVersion); \ + __macro(hipfftGetProperty); + +HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP); + +inline const char *hipfftGetErrorString(hipfftResult_t status) { + switch (status) { + case HIPFFT_SUCCESS: + return "'HIPFFT_SUCCESS'. The hipFFT operation was successful."; + case HIPFFT_INVALID_PLAN: + return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle."; + case HIPFFT_ALLOC_FAILED: + return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU " + "memory."; + case HIPFFT_INVALID_TYPE: + return "'HIPFFT_INVALID_TYPE'. No longer used."; + case HIPFFT_INVALID_VALUE: + return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or " + "parameter."; + case HIPFFT_INTERNAL_ERROR: + return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library " + "error."; + case HIPFFT_EXEC_FAILED: + return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU."; + case HIPFFT_SETUP_FAILED: + return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize."; + case HIPFFT_INVALID_SIZE: + return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size."; + case HIPFFT_UNALIGNED_DATA: + return "'HIPFFT_UNALIGNED_DATA'. No longer used."; + case HIPFFT_INCOMPLETE_PARAMETER_LIST: + return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call."; + case HIPFFT_INVALID_DEVICE: + return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different " + "GPU than plan creation."; + case HIPFFT_PARSE_ERROR: + return "'HIPFFT_PARSE_ERROR'. Internal plan database error."; + case HIPFFT_NO_WORKSPACE: + return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to " + "plan execution."; + case HIPFFT_NOT_IMPLEMENTED: + return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement " + "functionality for parameters given."; + case HIPFFT_NOT_SUPPORTED: + return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for " + "parameters given."; + default: + return "HIPFFT_STATUS_UNKNOWN_ERROR"; + } +} +} // namespace dynload +} // namespace pten + +#endif diff --git a/paddle/pten/backends/dynload/hiprand.cc b/paddle/pten/backends/dynload/hiprand.cc new file mode 100644 index 00000000000..e7a5acaa112 --- /dev/null +++ b/paddle/pten/backends/dynload/hiprand.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/hiprand.h" + +namespace pten { +namespace dynload { + +std::once_flag hiprand_dso_flag; +void *hiprand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/hiprand.h b/paddle/pten/backends/dynload/hiprand.h new file mode 100644 index 00000000000..b0c16da3dbc --- /dev/null +++ b/paddle/pten/backends/dynload/hiprand.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT +#include "paddle/pten/backends/dynload/port.h" + +#include "paddle/pten/backends/dynload/dynamic_loader.h" + +namespace pten { +namespace dynload { +extern std::once_flag hiprand_dso_flag; +extern void *hiprand_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + hiprandStatus_t operator()(Args... args) { \ + using hiprandFunc = decltype(&::__name); \ + std::call_once(hiprand_dso_flag, []() { \ + hiprand_dso_handle = pten::dynload::GetCurandDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define HIPRAND_RAND_ROUTINE_EACH(__macro) \ + __macro(hiprandCreateGenerator); \ + __macro(hiprandSetStream); \ + __macro(hiprandSetPseudoRandomGeneratorSeed); \ + __macro(hiprandGenerateUniform); \ + __macro(hiprandGenerateUniformDouble); \ + __macro(hiprandGenerateNormal); \ + __macro(hiprandDestroyGenerator); + +HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/hiprtc.cc b/paddle/pten/backends/dynload/hiprtc.cc new file mode 100644 index 00000000000..7ae1e2ab10f --- /dev/null +++ b/paddle/pten/backends/dynload/hiprtc.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/hiprtc.h" + +namespace pten { +namespace dynload { + +std::once_flag hiprtc_dso_flag; +void* hiprtc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HIPRTC_ROUTINE_EACH(DEFINE_WRAP); + +bool HasNVRTC() { + std::call_once(hiprtc_dso_flag, + []() { hiprtc_dso_handle = GetNVRTCDsoHandle(); }); + return hiprtc_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/hiprtc.h b/paddle/pten/backends/dynload/hiprtc.h new file mode 100644 index 00000000000..76c1753e981 --- /dev/null +++ b/paddle/pten/backends/dynload/hiprtc.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag hiprtc_dso_flag; +extern void* hiprtc_dso_handle; +extern bool HasNVRTC(); + +#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using hiprtc_func = decltype(&::__name); \ + std::call_once(hiprtc_dso_flag, []() { \ + hiprtc_dso_handle = pten::dynload::GetNVRTCDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed hiprtc functions + **/ +#define HIPRTC_ROUTINE_EACH(__macro) \ + __macro(hiprtcVersion); \ + __macro(hiprtcGetErrorString); \ + __macro(hiprtcCompileProgram); \ + __macro(hiprtcCreateProgram); \ + __macro(hiprtcDestroyProgram); \ + __macro(hiprtcGetCode); \ + __macro(hiprtcGetCodeSize); \ + __macro(hiprtcGetProgramLog); \ + __macro(hiprtcGetProgramLogSize) + +HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/lapack.cc b/paddle/pten/backends/dynload/lapack.cc new file mode 100644 index 00000000000..847f4528dae --- /dev/null +++ b/paddle/pten/backends/dynload/lapack.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/backends/dynload/lapack.h" +#include + +namespace pten { +namespace dynload { + +std::once_flag lapack_dso_flag; +void* lapack_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +LAPACK_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/lapack.h b/paddle/pten/backends/dynload/lapack.h new file mode 100644 index 00000000000..b5636850f8d --- /dev/null +++ b/paddle/pten/backends/dynload/lapack.h @@ -0,0 +1,340 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +// Note(zhouwei): because lapack doesn't provide appropriate header file. +// should expose API statement yourself. + +// getrf_(For example) +extern "C" void dgetrf_( + int *m, int *n, double *a, int *lda, int *ipiv, int *info); +extern "C" void sgetrf_( + int *m, int *n, float *a, int *lda, int *ipiv, int *info); + +// evd +extern "C" void zheevd_(char *jobz, + char *uplo, + int *n, + std::complex *a, + int *lda, + double *w, + std::complex *work, + int *lwork, + double *rwork, + int *lrwork, + int *iwork, + int *liwork, + int *info); +extern "C" void cheevd_(char *jobz, + char *uplo, + int *n, + std::complex *a, + int *lda, + float *w, + std::complex *work, + int *lwork, + float *rwork, + int *lrwork, + int *iwork, + int *liwork, + int *info); +extern "C" void dsyevd_(char *jobz, + char *uplo, + int *n, + double *a, + int *lda, + double *w, + double *work, + int *lwork, + int *iwork, + int *liwork, + int *info); +extern "C" void ssyevd_(char *jobz, + char *uplo, + int *n, + float *a, + int *lda, + float *w, + float *work, + int *lwork, + int *iwork, + int *liwork, + int *info); + +// geev +extern "C" void dgeev_(char *jobvl, + char *jobvr, + int *n, + double *a, + int *lda, + double *wr, + double *wi, + double *vl, + int *ldvl, + double *vr, + int *ldvr, + double *work, + int *lwork, + int *info); +extern "C" void sgeev_(char *jobvl, + char *jobvr, + int *n, + float *a, + int *lda, + float *wr, + float *wi, + float *vl, + int *ldvl, + float *vr, + int *ldvr, + float *work, + int *lwork, + int *info); +extern "C" void zgeev_(char *jobvl, + char *jobvr, + int *n, + std::complex *a, + int *lda, + std::complex *w, + std::complex *vl, + int *ldvl, + std::complex *vr, + int *ldvr, + std::complex *work, + int *lwork, + double *rwork, + int *info); +extern "C" void cgeev_(char *jobvl, + char *jobvr, + int *n, + std::complex *a, + int *lda, + std::complex *w, + std::complex *vl, + int *ldvl, + std::complex *vr, + int *ldvr, + std::complex *work, + int *lwork, + float *rwork, + int *info); + +// gels +extern "C" void dgels_(char *trans, + int *m, + int *n, + int *nrhs, + double *a, + int *lda, + double *b, + int *ldb, + double *work, + int *lwork, + int *info); +extern "C" void sgels_(char 
*trans, + int *m, + int *n, + int *nrhs, + float *a, + int *lda, + float *b, + int *ldb, + float *work, + int *lwork, + int *info); + +// gelsd +extern "C" void dgelsd_(int *m, + int *n, + int *nrhs, + double *a, + int *lda, + double *b, + int *ldb, + double *s, + double *rcond, + int *rank, + double *work, + int *lwork, + int *iwork, + int *info); +extern "C" void sgelsd_(int *m, + int *n, + int *nrhs, + float *a, + int *lda, + float *b, + int *ldb, + float *s, + float *rcond, + int *rank, + float *work, + int *lwork, + int *iwork, + int *info); + +// gelsy +extern "C" void dgelsy_(int *m, + int *n, + int *nrhs, + double *a, + int *lda, + double *b, + int *ldb, + int *jpvt, + double *rcond, + int *rank, + double *work, + int *lwork, + int *info); +extern "C" void sgelsy_(int *m, + int *n, + int *nrhs, + float *a, + int *lda, + float *b, + int *ldb, + int *jpvt, + float *rcond, + int *rank, + float *work, + int *lwork, + int *info); + +// gelss +extern "C" void dgelss_(int *m, + int *n, + int *nrhs, + double *a, + int *lda, + double *b, + int *ldb, + double *s, + double *rcond, + int *rank, + double *work, + int *lwork, + int *info); +extern "C" void sgelss_(int *m, + int *n, + int *nrhs, + float *a, + int *lda, + float *b, + int *ldb, + float *s, + float *rcond, + int *rank, + float *work, + int *lwork, + int *info); + +extern "C" void zpotrs_(char *uplo, + int *n, + int *nrhs, + std::complex *a, + int *lda, + std::complex *b, + int *ldb, + int *info); +extern "C" void cpotrs_(char *uplo, + int *n, + int *nrhs, + std::complex *a, + int *lda, + std::complex *b, + int *ldb, + int *info); +extern "C" void dpotrs_(char *uplo, + int *n, + int *nrhs, + double *a, + int *lda, + double *b, + int *ldb, + int *info); +extern "C" void spotrs_(char *uplo, + int *n, + int *nrhs, + float *a, + int *lda, + float *b, + int *ldb, + int *info); + +namespace pten { +namespace dynload { + +extern std::once_flag lapack_dso_flag; +extern void *lapack_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load lapack routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using lapackFunc = decltype(&::__name); \ + std::call_once(lapack_dso_flag, []() { \ + lapack_dso_handle = pten::dynload::GetLAPACKDsoHandle(); \ + }); \ + static void *p_##_name = dlsym(lapack_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \ + DYNAMIC_LOAD_LAPACK_WRAP(__name) + +#define LAPACK_ROUTINE_EACH(__macro) \ + __macro(dgetrf_); \ + __macro(sgetrf_); \ + __macro(zheevd_); \ + __macro(cheevd_); \ + __macro(dsyevd_); \ + __macro(ssyevd_); \ + __macro(dgeev_); \ + __macro(sgeev_); \ + __macro(zgeev_); \ + __macro(cgeev_); \ + __macro(dgels_); \ + __macro(sgels_); \ + __macro(dgelsd_); \ + __macro(sgelsd_); \ + __macro(dgelsy_); \ + __macro(sgelsy_); \ + __macro(dgelss_); \ + __macro(sgelss_); \ + __macro(zpotrs_); \ + __macro(cpotrs_); \ + __macro(dpotrs_); \ + __macro(spotrs_); + +LAPACK_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_LAPACK_WRAP); + +#undef DYNAMIC_LOAD_LAPACK_WRAP + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/miopen.cc b/paddle/pten/backends/dynload/miopen.cc new file mode 100644 index 00000000000..eaa97f6d850 --- /dev/null +++ b/paddle/pten/backends/dynload/miopen.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/miopen.h" +#include "paddle/fluid/platform/enforce.h" + +namespace pten { +namespace dynload { +std::once_flag miopen_dso_flag; +void* miopen_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MIOPEN_DNN_ROUTINE_EACH(DEFINE_WRAP); +MIOPEN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); + +#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R3 +MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R4 +MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_R5 +MIOPEN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_R6 +MIOPEN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_R7 +MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R7 +MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); +#endif + +bool HasCUDNN() { + std::call_once(miopen_dso_flag, + []() { miopen_dso_handle = GetCUDNNDsoHandle(); }); + return miopen_dso_handle != nullptr; +} + +void EnforceCUDNNLoaded(const char* fn_name) { + PADDLE_ENFORCE_NOT_NULL( + miopen_dso_handle, + paddle::platform::errors::PreconditionNotMet( + "Cannot load miopen shared library. Cannot invoke method %s.", + fn_name)); +} + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/miopen.h b/paddle/pten/backends/dynload/miopen.h new file mode 100644 index 00000000000..9868953fc2f --- /dev/null +++ b/paddle/pten/backends/dynload/miopen.h @@ -0,0 +1,196 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include +#include +#include // NOLINT +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +#define MIOPEN_VERSION \ + (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \ + MIOPEN_VERSION_PATCH) // NOLINT + +// MIOPEN only support NCHW, just for compatibility with CUDNN API +typedef enum { + MIOPEN_TENSOR_NCHW = 0, + MIOPEN_TENSOR_NHWC = 1, +} miopenTensorFormat_t; + +namespace pten { +namespace dynload { + +extern std::once_flag miopen_dso_flag; +extern void* miopen_dso_handle; +extern bool HasCUDNN(); + +inline const char* miopenGetErrorString(miopenStatus_t status) { + switch (status) { + case miopenStatusSuccess: + return "MIOPEN_STATUS_SUCCESS"; + case miopenStatusNotInitialized: + return "MIOPEN_STATUS_NOT_INITIALIZED"; + case miopenStatusInvalidValue: + return "MIOPEN_STATUS_INVALID_VALUE"; + case miopenStatusBadParm: + return "MIOPEN_STATUS_BAD_PARAM"; + case miopenStatusAllocFailed: + return "MIOPEN_STATUS_ALLOC_FAILED"; + case miopenStatusInternalError: + return "MIOPEN_STATUS_INTERNAL_ERROR"; + case miopenStatusNotImplemented: + return "MIOPEN_STATUS_NOT_IMPLEMENTED"; + case miopenStatusUnsupportedOp: + return "MIOPEN_STATUS_UNSUPPORTED_OP"; + case miopenStatusUnknownError: + default: + return "MIOPEN_STATUS_UNKNOWN_ERROR"; + } +} + +extern void EnforceCUDNNLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using miopen_func = decltype(&::__name); \ + std::call_once(miopen_dso_flag, []() { \ + miopen_dso_handle = pten::dynload::GetCUDNNDsoHandle(); \ + }); \ + EnforceCUDNNLoaded(#__name); \ + static void* p_##__name = dlsym(miopen_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed miopen functions in HPPL + **/ +#define MIOPEN_DNN_ROUTINE_EACH(__macro) \ + __macro(miopenGetVersion); \ + __macro(miopenOpTensor); \ + __macro(miopenSet4dTensorDescriptor); \ + __macro(miopenSetTensorDescriptor); \ + __macro(miopenInitConvolutionNdDescriptor); \ + __macro(miopenFindConvolutionForwardAlgorithm); \ + __macro(miopenGetConvolutionNdForwardOutputDim); \ + __macro(miopenFindConvolutionBackwardDataAlgorithm); \ + __macro(miopenFindConvolutionBackwardWeightsAlgorithm); \ + __macro(miopenGetTensorDescriptor); \ + __macro(miopenCreateTensorDescriptor); \ + __macro(miopenDestroyTensorDescriptor); \ + __macro(miopenGetTensorDescriptorSize); \ + __macro(miopenSet2dPoolingDescriptor); \ + __macro(miopenGet2dPoolingDescriptor); \ + __macro(miopenGetPoolingNdForwardOutputDim); \ + __macro(miopenCreateConvolutionDescriptor); \ + __macro(miopenCreatePoolingDescriptor); \ + __macro(miopenDestroyPoolingDescriptor); \ + __macro(miopenPoolingGetWorkSpaceSize); \ + __macro(miopenPoolingGetWorkSpaceSizeV2); \ + __macro(miopenSetNdPoolingDescriptor); \ + __macro(miopenInitConvolutionDescriptor); \ + __macro(miopenDestroyConvolutionDescriptor); \ + __macro(miopenGetConvolutionNdDescriptor); \ + __macro(miopenDeriveBNTensorDescriptor); \ + __macro(miopenCreate); \ + __macro(miopenDestroy); \ + __macro(miopenSetStream); \ + __macro(miopenActivationForward); \ + __macro(miopenActivationBackward); \ + __macro(miopenConvolutionBackwardWeights); \ + __macro(miopenConvolutionForward); \ + __macro(miopenConvolutionForwardBias); \ + __macro(miopenConvolutionBackwardBias); \ + __macro(miopenConvolutionForwardGetWorkSpaceSize); \ + __macro(miopenConvolutionBackwardDataGetWorkSpaceSize); \ + __macro(miopenTransformTensor); \ + __macro(miopenPoolingForward); \ + __macro(miopenPoolingBackward); \ + __macro(miopenSoftmaxBackward); \ + __macro(miopenSoftmaxBackward_V2); \ + __macro(miopenSoftmaxForward); \ + __macro(miopenSoftmaxForward_V2); \ + __macro(miopenCreateDropoutDescriptor); \ + __macro(miopenDestroyDropoutDescriptor); \ + __macro(miopenRestoreDropoutDescriptor); \ + __macro(miopenDropoutGetStatesSize); \ + __macro(miopenSetDropoutDescriptor); \ + __macro(miopenCreateRNNDescriptor); \ + __macro(miopenDestroyRNNDescriptor); \ + __macro(miopenSetRNNDescriptor); \ + __macro(miopenSetRNNDescriptor_V2); \ + __macro(miopenGetRNNParamsSize); \ + __macro(miopenGetRNNWorkspaceSize); \ + __macro(miopenGetRNNTrainingReserveSize); \ + __macro(miopenRNNForwardTraining); \ + __macro(miopenRNNBackwardData); \ + __macro(miopenRNNBackwardWeights); \ + __macro(miopenRNNForwardInference); \ + __macro(miopenGetTensorNumBytes); + +MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) + +#define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(miopenConvolutionBackwardData); +MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) + +// APIs available after R3: +#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize); +MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) + +// APIs available after R4: +#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + 
__macro(miopenBatchNormalizationForwardTraining); \ + __macro(miopenBatchNormalizationForwardInference); \ + __macro(miopenBatchNormalizationBackward); +MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) + +// APIs in R5 +#define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(miopenCreateActivationDescriptor); \ + __macro(miopenSetActivationDescriptor); \ + __macro(miopenGetActivationDescriptor); \ + __macro(miopenDestroyActivationDescriptor); +MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) + +// APIs in R6 +#define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \ +/*__macro(miopenSetRNNDescriptor_v6);*/ +MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) + +#define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(miopenSetConvolutionGroupCount); \ + __macro(miopenCreateCTCLossDescriptor); \ + __macro(miopenDestroyCTCLossDescriptor); \ + __macro(miopenGetCTCLossDescriptor); \ + __macro(miopenSetCTCLossDescriptor); \ + __macro(miopenGetCTCLossWorkspaceSize); \ + __macro(miopenCTCLoss); +MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) + +#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ +/*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ +__macro(cudnnBatchNormalizationForwardTrainingEx); \ +__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ +__macro(cudnnBatchNormalizationBackwardEx); \ +__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/ +MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/mklml.cc b/paddle/pten/backends/dynload/mklml.cc new file mode 100644 index 00000000000..dfa1491f027 --- /dev/null +++ b/paddle/pten/backends/dynload/mklml.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/mklml.h" + +namespace pten { +namespace dynload { + +std::once_flag mklml_dso_flag; +void* mklml_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MKLML_ROUTINE_EACH(DEFINE_WRAP); + +#if !defined(_WIN32) +DEFINE_WRAP(mkl_scsrmm); +DEFINE_WRAP(mkl_dcsrmm); +#endif + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/mklml.h b/paddle/pten/backends/dynload/mklml.h new file mode 100644 index 00000000000..a8baca6aecd --- /dev/null +++ b/paddle/pten/backends/dynload/mklml.h @@ -0,0 +1,123 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag mklml_dso_flag; +extern void *mklml_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mklml routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_MKLML_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using mklmlFunc = decltype(&::__name); \ + std::call_once(mklml_dso_flag, []() { \ + mklml_dso_handle = pten::dynload::GetMKLMLDsoHandle(); \ + }); \ + static void *p_##_name = dlsym(mklml_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name) + +#define MKLML_ROUTINE_EACH(__macro) \ + __macro(cblas_sgemm); \ + __macro(cblas_dgemm); \ + __macro(cblas_cgemm); \ + __macro(cblas_zgemm); \ + __macro(cblas_saxpy); \ + __macro(cblas_daxpy); \ + __macro(cblas_caxpy); \ + __macro(cblas_zaxpy); \ + __macro(cblas_scopy); \ + __macro(cblas_dcopy); \ + __macro(cblas_ccopy); \ + __macro(cblas_zcopy); \ + __macro(cblas_sgemv); \ + __macro(cblas_dgemv); \ + __macro(cblas_cgemv); \ + __macro(cblas_zgemv); \ + __macro(cblas_strsm); \ + __macro(cblas_dtrsm); \ + __macro(cblas_ctrsm); \ + __macro(cblas_ztrsm); \ + __macro(cblas_sgemm_alloc); \ + __macro(cblas_dgemm_alloc); \ + __macro(cblas_sgemm_pack); \ + __macro(cblas_dgemm_pack); \ + __macro(cblas_sgemm_compute); \ + __macro(cblas_dgemm_compute); \ + __macro(cblas_sgemm_free); \ + __macro(cblas_dgemm_free); \ + __macro(cblas_sgemm_batch); \ + __macro(cblas_dgemm_batch); \ + __macro(cblas_cgemm_batch); \ + __macro(cblas_zgemm_batch); \ + __macro(cblas_sdot); \ + __macro(cblas_ddot); \ + __macro(cblas_sasum); \ + __macro(cblas_dasum); \ + __macro(cblas_isamax); \ + __macro(cblas_idamax); \ + __macro(cblas_sscal); \ + __macro(cblas_dscal); \ + __macro(vsAdd); \ + __macro(vdAdd); \ + __macro(vsSub); \ + __macro(vdSub); \ + __macro(vsMul); \ + __macro(vdMul); \ + __macro(vsDiv); \ + __macro(vdDiv); \ + __macro(vsExp); \ + __macro(vdExp); \ + __macro(vsSqr); \ + __macro(vdSqr); \ + __macro(vsPowx); \ + __macro(vdPowx); \ + __macro(vsInv); \ + __macro(vdInv); \ + __macro(vmsErf); \ + __macro(vmdErf); \ + __macro(MKL_Free_Buffers); \ + __macro(MKL_Set_Num_Threads); \ + __macro(MKL_Get_Max_Threads); + +MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); + +#if !defined(_WIN32) +DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm); +DYNAMIC_LOAD_MKLML_WRAP(mkl_dcsrmm); +#endif + +#undef DYNAMIC_LOAD_MKLML_WRAP + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/mklrt.cc b/paddle/pten/backends/dynload/mklrt.cc new file mode 100644 index 00000000000..27c544ff25f --- /dev/null +++ b/paddle/pten/backends/dynload/mklrt.cc @@ -0,0 +1,50 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/mklrt.h" + +namespace pten { +namespace dynload { + +std::once_flag mklrt_dso_flag; +void* mklrt_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MKLDFTI_ROUTINE_EACH(DEFINE_WRAP); + +DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc, + enum DFTI_CONFIG_VALUE prec, + enum DFTI_CONFIG_VALUE domain, + MKL_LONG dim, + MKL_LONG* sizes) { + if (prec == DFTI_SINGLE) { + if (dim == 1) { + return DftiCreateDescriptor_s_1d(desc, domain, sizes[0]); + } else { + return DftiCreateDescriptor_s_md(desc, domain, dim, sizes); + } + } else if (prec == DFTI_DOUBLE) { + if (dim == 1) { + return DftiCreateDescriptor_d_1d(desc, domain, sizes[0]); + } else { + return DftiCreateDescriptor_d_md(desc, domain, dim, sizes); + } + } else { + return DftiCreateDescriptor(desc, prec, domain, dim, sizes); + } +} + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/mklrt.h b/paddle/pten/backends/dynload/mklrt.h new file mode 100644 index 00000000000..fe87b170a1c --- /dev/null +++ b/paddle/pten/backends/dynload/mklrt.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag mklrt_dso_flag; +extern void* mklrt_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mkldfti routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using mklrtFunc = decltype(&::__name); \ + std::call_once(mklrt_dso_flag, []() { \ + mklrt_dso_handle = pten::dynload::GetMKLRTDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(mklrt_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +// mkl_dfti.h has a macro that shadows the function with the same name +// un-defeine this macro so as to export that function +#undef DftiCreateDescriptor + +#define MKLDFTI_ROUTINE_EACH(__macro) \ + __macro(DftiCreateDescriptor); \ + __macro(DftiCreateDescriptor_s_1d); \ + __macro(DftiCreateDescriptor_d_1d); \ + __macro(DftiCreateDescriptor_s_md); \ + __macro(DftiCreateDescriptor_d_md); \ + __macro(DftiSetValue); \ + __macro(DftiGetValue); \ + __macro(DftiCommitDescriptor); \ + __macro(DftiComputeForward); \ + __macro(DftiComputeBackward); \ + __macro(DftiFreeDescriptor); \ + __macro(DftiErrorClass); \ + __macro(DftiErrorMessage); + +MKLDFTI_ROUTINE_EACH(DYNAMIC_LOAD_MKLRT_WRAP) + +#undef DYNAMIC_LOAD_MKLRT_WRAP + +// define another function to avoid naming conflict +DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc, + enum DFTI_CONFIG_VALUE prec, + enum DFTI_CONFIG_VALUE domain, + MKL_LONG dim, + MKL_LONG* sizes); + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/nccl.cc b/paddle/pten/backends/dynload/nccl.cc new file mode 100644 index 00000000000..1d393213a0e --- /dev/null +++ b/paddle/pten/backends/dynload/nccl.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/nccl.h" + +namespace pten { +namespace dynload { + +std::once_flag nccl_dso_flag; +void *nccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +#if NCCL_VERSION_CODE >= 2212 +NCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) +#endif + +#if NCCL_VERSION_CODE >= 2304 +NCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) +#endif + +#if NCCL_VERSION_CODE >= 2703 +NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) +#endif + +#if NCCL_VERSION_CODE >= 21100 +NCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) +#endif + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/nccl.h b/paddle/pten/backends/dynload/nccl.h new file mode 100644 index 00000000000..85a062fbf08 --- /dev/null +++ b/paddle/pten/backends/dynload/nccl.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag nccl_dso_flag; +extern void* nccl_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using nccl_func = decltype(&::__name); \ + std::call_once(nccl_dso_flag, []() { \ + nccl_dso_handle = pten::dynload::GetNCCLDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(nccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define NCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(ncclCommInitAll); \ + __macro(ncclGetUniqueId); \ + __macro(ncclCommInitRank); \ + __macro(ncclCommDestroy); \ + __macro(ncclCommCount); \ + __macro(ncclCommCuDevice); \ + __macro(ncclCommUserRank); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ + __macro(ncclReduce); \ + __macro(ncclReduceScatter); \ + __macro(ncclGetErrorString); + +NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) + +#if NCCL_VERSION_CODE >= 2212 +#define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast); +NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) +#endif + +#if NCCL_VERSION_CODE >= 2304 +#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); +NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) +#endif + +#if NCCL_VERSION_CODE >= 2703 +#define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ + __macro(ncclSend); \ + __macro(ncclRecv); +NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) +#endif + +#if NCCL_VERSION_CODE >= 21100 +#define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ + __macro(ncclRedOpCreatePreMulSum); \ + __macro(ncclRedOpDestroy); +NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) +#endif + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/nvjpeg.cc b/paddle/pten/backends/dynload/nvjpeg.cc new file mode 100644 index 00000000000..ea385e282fc --- /dev/null +++ b/paddle/pten/backends/dynload/nvjpeg.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/backends/dynload/nvjpeg.h" + +namespace pten { +namespace dynload { + +std::once_flag nvjpeg_dso_flag; +void *nvjpeg_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/nvjpeg.h b/paddle/pten/backends/dynload/nvjpeg.h new file mode 100644 index 00000000000..9abcfaee6ed --- /dev/null +++ b/paddle/pten/backends/dynload/nvjpeg.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_CUDA +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { +extern std::once_flag nvjpeg_dso_flag; +extern void *nvjpeg_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + nvjpegStatus_t operator()(Args... args) { \ + using nvjpegFunc = decltype(&::__name); \ + std::call_once(nvjpeg_dso_flag, []() { \ + nvjpeg_dso_handle = pten::dynload::GetNvjpegDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define NVJPEG_RAND_ROUTINE_EACH(__macro) \ + __macro(nvjpegCreateSimple); \ + __macro(nvjpegJpegStateCreate); \ + __macro(nvjpegGetImageInfo); \ + __macro(nvjpegJpegStateDestroy); \ + __macro(nvjpegDecode); + +NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP); + +} // namespace dynload +} // namespace pten + +#endif diff --git a/paddle/pten/backends/dynload/nvrtc.cc b/paddle/pten/backends/dynload/nvrtc.cc new file mode 100644 index 00000000000..e86d943a249 --- /dev/null +++ b/paddle/pten/backends/dynload/nvrtc.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/backends/dynload/nvrtc.h" + +namespace pten { +namespace dynload { + +std::once_flag nvrtc_dso_flag; +void* nvrtc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NVRTC_ROUTINE_EACH(DEFINE_WRAP); + +bool HasNVRTC() { + std::call_once(nvrtc_dso_flag, + []() { nvrtc_dso_handle = GetNVRTCDsoHandle(); }); + return nvrtc_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/nvrtc.h b/paddle/pten/backends/dynload/nvrtc.h new file mode 100644 index 00000000000..5dcb1142d58 --- /dev/null +++ b/paddle/pten/backends/dynload/nvrtc.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag nvrtc_dso_flag; +extern void* nvrtc_dso_handle; +extern bool HasNVRTC(); + +#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using nvrtc_func = decltype(&::__name); \ + std::call_once(nvrtc_dso_flag, []() { \ + nvrtc_dso_handle = pten::dynload::GetNVRTCDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(nvrtc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed nvrtc functions + **/ +#define NVRTC_ROUTINE_EACH(__macro) \ + __macro(nvrtcVersion); \ + __macro(nvrtcGetErrorString); \ + __macro(nvrtcCompileProgram); \ + __macro(nvrtcCreateProgram); \ + __macro(nvrtcDestroyProgram); \ + __macro(nvrtcGetPTX); \ + __macro(nvrtcGetPTXSize); \ + __macro(nvrtcGetProgramLog); \ + __macro(nvrtcGetProgramLogSize) + +NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/nvtx.cc b/paddle/pten/backends/dynload/nvtx.cc new file mode 100644 index 00000000000..6d248ff2de0 --- /dev/null +++ b/paddle/pten/backends/dynload/nvtx.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifndef _WIN32 +#include "paddle/pten/backends/dynload/nvtx.h" + +namespace pten { +namespace dynload { + +std::once_flag nvtx_dso_flag; +void *nvtx_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NVTX_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace pten +#endif diff --git a/paddle/pten/backends/dynload/nvtx.h b/paddle/pten/backends/dynload/nvtx.h new file mode 100644 index 00000000000..98c2d539339 --- /dev/null +++ b/paddle/pten/backends/dynload/nvtx.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#ifndef _WIN32 +#include +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { +extern std::once_flag nvtx_dso_flag; +extern void *nvtx_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + int operator()(Args... args) { \ + using nvtxFunc = decltype(&::__name); \ + std::call_once(nvtx_dso_flag, []() { \ + nvtx_dso_handle = pten::dynload::GetNvtxDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define NVTX_ROUTINE_EACH(__macro) \ + __macro(nvtxRangePushA); \ + __macro(nvtxRangePop); + +NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP +} // namespace dynload +} // namespace pten +#endif diff --git a/paddle/fluid/platform/port.h b/paddle/pten/backends/dynload/port.h similarity index 100% rename from paddle/fluid/platform/port.h rename to paddle/pten/backends/dynload/port.h diff --git a/paddle/pten/backends/dynload/rccl.cc b/paddle/pten/backends/dynload/rccl.cc new file mode 100644 index 00000000000..46bbaea1362 --- /dev/null +++ b/paddle/pten/backends/dynload/rccl.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/backends/dynload/rccl.h" + +namespace pten { +namespace dynload { + +std::once_flag rccl_dso_flag; +void *rccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +#if NCCL_VERSION_CODE >= 2212 +RCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) +#endif + +#if NCCL_VERSION_CODE >= 2703 +RCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) +#endif + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/rccl.h b/paddle/pten/backends/dynload/rccl.h new file mode 100644 index 00000000000..370bab6658f --- /dev/null +++ b/paddle/pten/backends/dynload/rccl.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag rccl_dso_flag; +extern void* rccl_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using nccl_func = decltype(&::__name); \ + std::call_once(rccl_dso_flag, []() { \ + rccl_dso_handle = pten::dynload::GetNCCLDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(rccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define RCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(ncclCommInitAll); \ + __macro(ncclGetUniqueId); \ + __macro(ncclCommInitRank); \ + __macro(ncclCommDestroy); \ + __macro(ncclCommCount); \ + __macro(ncclCommCuDevice); \ + __macro(ncclCommUserRank); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ + __macro(ncclReduce); \ + __macro(ncclReduceScatter); \ + __macro(ncclGetErrorString); + +RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) + +#if NCCL_VERSION_CODE >= 2212 +#define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast); +RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) +#endif + +#if NCCL_VERSION_CODE >= 2703 +#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ + __macro(ncclSend); \ + __macro(ncclRecv); +RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) +#endif + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/rocblas.cc b/paddle/pten/backends/dynload/rocblas.cc new file mode 100644 index 00000000000..0b7d4469f38 --- /dev/null +++ b/paddle/pten/backends/dynload/rocblas.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/rocblas.h" + +namespace pten { +namespace dynload { +std::once_flag rocblas_dso_flag; +void *rocblas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +ROCBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R2 +ROCBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + +#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R3 +ROCBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); +#endif + +#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R4 +ROCBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); +#endif +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/rocblas.h b/paddle/pten/backends/dynload/rocblas.h new file mode 100644 index 00000000000..eb7e6d58e1b --- /dev/null +++ b/paddle/pten/backends/dynload/rocblas.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag rocblas_dso_flag; +extern void *rocblas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + rocblas_status operator()(Args... 
args) { \ + using rocblas_func = decltype(&::__name); \ + std::call_once(rocblas_dso_flag, []() { \ + rocblas_dso_handle = pten::dynload::GetCublasDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(rocblas_caxpy); \ + __macro(rocblas_saxpy); \ + __macro(rocblas_daxpy); \ + __macro(rocblas_zaxpy); \ + __macro(rocblas_sscal); \ + __macro(rocblas_dscal); \ + __macro(rocblas_scopy); \ + __macro(rocblas_dcopy); \ + __macro(rocblas_cgemv); \ + __macro(rocblas_sgemv); \ + __macro(rocblas_zgemv); \ + __macro(rocblas_dgemv); \ + __macro(rocblas_cgemm); \ + __macro(rocblas_sgemm); \ + __macro(rocblas_dgemm); \ + __macro(rocblas_hgemm); \ + __macro(rocblas_zgemm); \ + __macro(rocblas_sgeam); \ + __macro(rocblas_strsm); \ + __macro(rocblas_dtrsm); \ + __macro(rocblas_dgeam); \ + __macro(rocblas_sgemm_batched); \ + __macro(rocblas_dgemm_batched); \ + __macro(rocblas_cgemm_batched); \ + __macro(rocblas_zgemm_batched); \ + __macro(rocblas_create_handle); \ + __macro(rocblas_destroy_handle); \ + __macro(rocblas_set_stream); \ + __macro(rocblas_get_stream); \ + __macro(rocblas_set_pointer_mode); \ + __macro(rocblas_get_pointer_mode); + +ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) + +// APIs available after CUDA 8.0 +#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ + __macro(rocblas_gemm_ex); \ + __macro(rocblas_sgemm_strided_batched); \ + __macro(rocblas_dgemm_strided_batched); \ + __macro(rocblas_cgemm_strided_batched); \ + __macro(rocblas_zgemm_strided_batched); \ + __macro(rocblas_hgemm_strided_batched); + +ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) + +// HIP not supported in ROCM3.5 +// #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro) +// __macro(cublasSetMathMode); +// __macro(cublasGetMathMode); +// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) + +#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ + __macro(rocblas_gemm_batched_ex); \ + __macro(rocblas_gemm_strided_batched_ex); + +ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) + +#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/rocm_driver.cc b/paddle/pten/backends/dynload/rocm_driver.cc new file mode 100644 index 00000000000..51d55e80a72 --- /dev/null +++ b/paddle/pten/backends/dynload/rocm_driver.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/backends/dynload/rocm_driver.h" + +namespace pten { +namespace dynload { + +std::once_flag rocm_dso_flag; +void* rocm_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +ROCM_ROUTINE_EACH(DEFINE_WRAP); + +bool HasCUDADriver() { + std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); }); + return rocm_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/rocm_driver.h b/paddle/pten/backends/dynload/rocm_driver.h new file mode 100644 index 00000000000..fcc6b7f037c --- /dev/null +++ b/paddle/pten/backends/dynload/rocm_driver.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" + +namespace pten { +namespace dynload { + +extern std::once_flag rocm_dso_flag; +extern void* rocm_dso_handle; +extern bool HasCUDADriver(); + +#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using rocm_func = decltype(&::__name); \ + std::call_once(rocm_dso_flag, []() { \ + rocm_dso_handle = pten::dynload::GetCUDADsoHandle(); \ + }); \ + static void* p_##__name = dlsym(rocm_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed cuda driver functions + **/ +#define ROCM_ROUTINE_EACH(__macro) \ + __macro(hipDriverGetVersion); \ + __macro(hipGetErrorString); \ + __macro(hipModuleLoadData); \ + __macro(hipModuleGetFunction); \ + __macro(hipModuleUnload); \ + /*rocm3.5 not support the function*/ \ + /* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \ + __macro(hipModuleLaunchKernel); \ + __macro(hipLaunchKernel); \ + __macro(hipGetDevice); \ + __macro(hipGetDeviceCount); \ + __macro(hipDevicePrimaryCtxGetState) + +ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/tensorrt.cc b/paddle/pten/backends/dynload/tensorrt.cc new file mode 100644 index 00000000000..680dad52890 --- /dev/null +++ b/paddle/pten/backends/dynload/tensorrt.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/pten/backends/dynload/tensorrt.h" +#include + +namespace pten { +namespace dynload { + +std::once_flag tensorrt_dso_flag; +void* tensorrt_dso_handle; + +std::once_flag tensorrt_plugin_dso_flag; +void* tensorrt_plugin_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +TENSORRT_RAND_ROUTINE_EACH_POINTER(DEFINE_WRAP); +TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(DEFINE_WRAP); +TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DEFINE_WRAP); + +void* GetDsoHandle(const std::string& dso_name) { +#if !defined(_WIN32) + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; +#else + int dynload_flags = 0; +#endif // !_WIN32 + + void* dso_handle = dlopen(dso_name.c_str(), dynload_flags); + + if (nullptr == dso_handle) { + auto error_msg = + "You are using Paddle compiled with TensorRT, but TensorRT dynamic " + "library is not found. Ignore this if TensorRT is not needed.\n" + "The TensorRT that Paddle depends on is not configured correctly.\n" + " Suggestions:\n" + " 1. Check if the TensorRT is installed correctly and its version" + " is matched with paddlepaddle you installed.\n" + " 2. Configure environment variables as " + "follows:\n" + " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" + " - Windows: set PATH by `set PATH=XXX;%PATH%`\n" + " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...`\n"; + LOG(WARNING) << error_msg; + } + return dso_handle; +} + +void* GetTensorRtHandle() { +#if defined(__APPLE__) || defined(__OSX__) + std::string dso_name = "libnvinfer.dylib"; +#elif defined(_WIN32) + std::string dso_name = "nvinfer.dll"; +#else + std::string dso_name = "libnvinfer.so"; +#endif + return GetDsoHandle(dso_name); +} + +void* GetTensorRtPluginHandle() { +#if defined(__APPLE__) || defined(__OSX__) + std::string dso_name = "libnvinfer_plugin.dylib"; +#elif defined(_WIN32) + std::string dso_name = "nvinfer_plugin.dll"; +#else + std::string dso_name = "libnvinfer_plugin.so"; +#endif + return GetDsoHandle(dso_name); +} + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/tensorrt.h b/paddle/pten/backends/dynload/tensorrt.h new file mode 100644 index 00000000000..ed710085acd --- /dev/null +++ b/paddle/pten/backends/dynload/tensorrt.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include +#if !defined(_WIN32) +#include +#endif + +#include // NOLINT + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/backends/dynload/dynamic_loader.h" + +namespace pten { +namespace dynload { + +void* GetTensorRtHandle(); + +extern std::once_flag tensorrt_dso_flag; +extern void* tensorrt_dso_handle; + +void* GetTensorRtPluginHandle(); +extern std::once_flag tensorrt_plugin_dso_flag; +extern void* tensorrt_plugin_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + void* operator()(Args... args) { \ + std::call_once(tensorrt_dso_flag, []() { \ + tensorrt_dso_handle = pten::dynload::GetTensorRtHandle(); \ + }); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + if (p_##__name == nullptr) { \ + return nullptr; \ + } \ + using tensorrt_func = decltype(&::__name); \ + auto ret = reinterpret_cast(p_##__name)(args...); \ + return static_cast(ret); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + std::call_once(tensorrt_dso_flag, []() { \ + tensorrt_dso_handle = pten::dynload::GetTensorRtHandle(); \ + }); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + PADDLE_ENFORCE_NOT_NULL(p_##__name, \ + paddle::platform::errors::Unavailable( \ + "Load tensorrt api %s failed", #__name)); \ + using tensorrt_func = decltype(&::__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + std::call_once(tensorrt_plugin_dso_flag, []() { \ + tensorrt_plugin_dso_handle = pten::dynload::GetTensorRtPluginHandle(); \ + }); \ + static void* p_##__name = dlsym(tensorrt_plugin_dso_handle, #__name); \ + PADDLE_ENFORCE_NOT_NULL(p_##__name, \ + paddle::platform::errors::Unavailable( \ + "Load tensorrt plugin %s failed", #__name)); \ + using tensorrt_plugin_func = decltype(&::__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#ifdef NV_TENSORRT_MAJOR + +#if (NV_TENSORRT_MAJOR >= 6) +#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \ + __macro(createInferBuilder_INTERNAL); \ + __macro(createInferRuntime_INTERNAL); \ + __macro(getPluginRegistry); +#else +#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \ + __macro(createInferBuilder_INTERNAL); \ + __macro(createInferRuntime_INTERNAL); +#endif + +#define TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(__macro) \ + __macro(getInferLibVersion); + +#define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \ + __macro(initLibNvInferPlugins); + +TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP) +TENSORRT_RAND_ROUTINE_EACH_NON_POINTER( + DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP) +TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP) + +#endif // end of NV_TENSORRT_MAJOR + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/warpctc.cc b/paddle/pten/backends/dynload/warpctc.cc new file mode 100644 index 00000000000..3c34b016732 --- /dev/null +++ b/paddle/pten/backends/dynload/warpctc.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/dynload/warpctc.h" + +namespace pten { +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +WARPCTC_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace pten diff --git a/paddle/pten/backends/dynload/warpctc.h b/paddle/pten/backends/dynload/warpctc.h new file mode 100644 index 00000000000..2852293a686 --- /dev/null +++ b/paddle/pten/backends/dynload/warpctc.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // NOLINT + +#include "paddle/pten/backends/dynload/dynamic_loader.h" +#include "paddle/pten/backends/dynload/port.h" +#include "warpctc/include/ctc.h" + +namespace pten { +namespace dynload { + +extern std::once_flag warpctc_dso_flag; +extern void* warpctc_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using warpctcFunc = decltype(&::__name); \ + std::call_once(warpctc_dso_flag, []() { \ + warpctc_dso_handle = pten::dynload::GetWarpCTCDsoHandle(); \ + }); \ + static void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + DYNAMIC_LOAD_WARPCTC_WRAP(__name) + +#define WARPCTC_ROUTINE_EACH(__macro) \ + __macro(get_warpctc_version); \ + __macro(ctcGetStatusString); \ + __macro(compute_ctc_loss); \ + __macro(compute_ctc_loss_double); \ + __macro(get_workspace_size); \ + __macro(get_workspace_size_double) + +WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} // namespace dynload +} // namespace pten -- GitLab
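Every header added by this patch (nccl.h, nvjpeg.h, nvrtc.h, rccl.h, rocblas.h, warpctc.h, ...) repeats the same dynamic-loading idiom: a per-library std::once_flag guards a single dlopen, each wrapped API gets a tiny functor struct whose operator() resolves the symbol lazily with dlsym, caches the pointer in a function-local static, and forwards the call through a cast to the original function type. The sketch below is not part of the patch; it is a minimal, standalone illustration of that idiom, assuming a glibc Linux host so that "libc.so.6" and the non-overloaded ::strlen can stand in for a real dependency such as libnccl and ncclAllReduce. Build with something like `g++ -std=c++14 demo.cc -ldl`.

// demo.cc -- hedged sketch of the pten::dynload wrapper pattern.
// "libc.so.6" and strlen are placeholder choices for illustration only.
#include <dlfcn.h>   // dlopen, dlsym

#include <iostream>
#include <mutex>     // std::once_flag, std::call_once
#include <string.h>  // declares ::strlen; used only for its type

namespace demo {

std::once_flag libc_dso_flag;
void* libc_dso_handle = nullptr;

// Mirrors DECLARE_DYNAMIC_LOAD_*_WRAP: one functor type per wrapped symbol.
#define DEMO_DECLARE_DYNAMIC_LOAD_WRAP(__name)                         \
  struct DynLoad__##__name {                                           \
    template <typename... Args>                                        \
    auto operator()(Args... args) -> decltype(__name(args...)) {       \
      using func_t = decltype(&::__name);                              \
      std::call_once(libc_dso_flag, []() {                             \
        /* Same flags tensorrt.cc uses for its dlopen on non-Windows. */\
        libc_dso_handle = dlopen("libc.so.6", RTLD_LAZY | RTLD_LOCAL); \
      });                                                              \
      /* Resolved once per symbol and cached for later calls. */       \
      static void* p_##__name = dlsym(libc_dso_handle, #__name);       \
      return reinterpret_cast<func_t>(p_##__name)(args...);            \
    }                                                                  \
  };                                                                   \
  DynLoad__##__name __name

// Analogous to expanding NCCL_RAND_ROUTINE_EACH over the macro above.
DEMO_DECLARE_DYNAMIC_LOAD_WRAP(strlen);

}  // namespace demo

int main() {
  // Goes through the lazily resolved symbol, not the link-time binding.
  std::cout << demo::strlen("hello") << std::endl;  // prints 5
  return 0;
}

The call_once makes the dlopen thread-safe while keeping it lazy, and the function-local static means each wrapped entry point pays the dlsym cost only on its first use; this is why the moved headers can be included unconditionally even when the optional library is absent at runtime, as long as none of the wrappers is actually invoked.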