Unverified Commit 3c1dc6f6 authored by Wilber, committed by GitHub

[PTEN] Move dynload from fluid to pten. (#39120)

* move dynload from fluid to pten.

* fix ci compile

* fix windows ci compile.

* update

* update

* fix compile error
Parent 11938ae1
......@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace butil {
class IOBuf;
......@@ -78,11 +78,11 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
const framework::Scope* scope);
void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& iobuf,
butil::IOBufBytesIterator& iobuf, // NOLINT
const platform::DeviceContext& ctx);
void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& iobuf,
butil::IOBufBytesIterator& iobuf, // NOLINT
const platform::DeviceContext& ctx);
std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port);
......
......@@ -40,9 +40,9 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace distributed {
......@@ -202,7 +202,7 @@ class ValueBlock {
// value = _alloc.acquire(value_length_);
table[id] = value;
} else {
value = (VALUE *)(void *)(res->second);
value = (VALUE *)(void *)(res->second); // NOLINT
}
return value;
}
......@@ -282,8 +282,8 @@ class ValueBlock {
value->unseen_days_++;
if (value->unseen_days_ >= threshold) {
butil::return_object(iter->second);
//_alloc.release(iter->second);
//_alloc.release(value);
// _alloc.release(iter->second);
// _alloc.release(value);
iter = table.erase(iter);
} else {
++iter;
......
......@@ -38,8 +38,8 @@ limitations under the License. */
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace framework {
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc: glog's severity names conflict with windows.h
#include "paddle/fluid/framework/io/shell.h"
#include "paddle/fluid/platform/enforce.h"
......
......@@ -34,8 +34,8 @@
#include <utility>
#include <vector>
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/backends/dynload/port.h"
#if defined(__arm__) || defined(__aarch64__) || defined(__ARM_NEON) || \
defined(__ARM_NEON__)
......
......@@ -34,7 +34,7 @@ limitations under the License. */
#include "paddle/fluid/framework/trainer_desc.pb.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace framework {
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace inference {
......
......@@ -28,7 +28,7 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
#ifdef _WIN32
#include <direct.h>
......
......@@ -20,7 +20,7 @@
#include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace framework {
......
......@@ -31,8 +31,8 @@
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/pten/backends/dynload/port.h"
extern std::string paddle::framework::DataTypeToString(
const framework::proto::VarType::Type type);
......
......@@ -22,8 +22,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/pten/backends/dynload/port.h"
DECLARE_bool(use_mkldnn);
......
......@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
DECLARE_bool(use_mkldnn);
......
......@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace operators {
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
#define HCOM_GROUP_PREFIX "HCOM_GROUP_"
......
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce pten_dynamic_loader)
list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
......@@ -34,24 +34,24 @@ if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
if(WITH_ROCM)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader pten_dynload_cuda)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
elseif (WITH_ASCEND_CL)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl pten_dynload_warpctc)
else()
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader pten_dynload_cuda)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
endif()
if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml pten_dynload_mklml)
endif()
cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader)
cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader pten_dynload_lapack)
add_dependencies(dynload_lapack extern_lapack)
# TODO(TJ): add iomp, mkldnn?
if (MKL_FOUND AND WITH_ONEMKL)
message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader)
cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader pten_dynload_mklrt)
target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE})
endif()
......@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cublas_dso_flag;
void *cublas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......
......@@ -20,16 +20,12 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cublas.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cublas_dso_flag;
extern void *cublas_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublas routine
......@@ -37,19 +33,8 @@ extern void *cublas_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublas_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublas_dso_flag, []() { \
cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublas_dso_handle, #__name); \
return reinterpret_cast<cublas_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
......@@ -99,7 +84,7 @@ extern void *cublas_dso_handle;
__macro(cublasSgetrsBatched); \
__macro(cublasDgetrsBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000
......@@ -111,7 +96,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.0
......@@ -120,7 +105,7 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasSetMathMode); \
__macro(cublasGetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.1
......@@ -129,10 +114,10 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx);
CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cublasLt_dso_flag;
void *cublasLt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......
......@@ -19,16 +19,12 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cublasLt.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cublasLt_dso_flag;
extern void *cublasLt_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublasLt routine
......@@ -36,20 +32,8 @@ extern void *cublasLt_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = \
paddle::platform::dynload::GetCublasLtDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
// APIs available after CUDA 10.1
......@@ -69,10 +53,10 @@ extern void *cublasLt_dso_handle;
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -13,14 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#include "paddle/pten/backends/dynload/cuda_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cuda_dso_flag;
void* cuda_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#if CUDA_VERSION >= 10020
......@@ -28,10 +26,7 @@ CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP);
#endif
CUDA_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() {
std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); });
return cuda_dso_handle != nullptr;
}
bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
} // namespace dynload
} // namespace platform
......
......@@ -17,30 +17,17 @@ limitations under the License. */
#include <cuda.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cuda_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cuda_dso_flag;
extern void* cuda_dso_handle;
extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cuda_func = decltype(&::__name); \
std::call_once(cuda_dso_flag, []() { \
cuda_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(cuda_dso_handle, #__name); \
return reinterpret_cast<cuda_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cuda driver functions
......@@ -72,12 +59,12 @@ extern bool HasCUDADriver();
__macro(cuMemRelease); \
__macro(cuMemAddressFree)
CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
CUDA_ROUTINE_EACH_VVM(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#endif
CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
CUDA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP
} // namespace dynload
} // namespace platform
......
......@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cudnn_dso_flag;
void* cudnn_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......@@ -45,19 +43,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
return cudnn_dso_handle != nullptr;
}
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cudnn_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cudnn shared library. Cannot invoke method %s.",
fn_name));
}
bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
} // namespace dynload
} // namespace platform
......
......@@ -18,32 +18,17 @@ limitations under the License. */
#include <glog/logging.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cudnn_dso_flag;
extern void* cudnn_dso_handle;
extern bool HasCUDNN();
extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cudnn_func = decltype(&::__name); \
std::call_once(cudnn_dso_flag, []() { \
cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cudnn functions in HPPL
......@@ -127,7 +112,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnGetActivationDescriptor); \
__macro(cudnnDestroyActivationDescriptor); \
__macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \
......@@ -135,7 +120,8 @@ CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(
PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7001
......@@ -153,7 +139,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
__macro(cudnnGetConvolutionForwardAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7201
......@@ -166,7 +152,7 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx);
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7401
......@@ -176,7 +162,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 8000
......@@ -192,7 +178,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnSetFusedOpsConstParamPackAttribute); \
__macro(cudnnSetFusedOpsVariantParamPackAttribute); \
__macro(cudnnMakeFusedOpsPlan);
CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R8(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
} // namespace dynload
......
......@@ -13,31 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cufft.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/cufft.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cufft_dso_flag;
void* cufft_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUFFT() {
std::call_once(cufft_dso_flag,
[]() { cufft_dso_handle = GetCUFFTDsoHandle(); });
return cufft_dso_handle != nullptr;
}
void EnforceCUFFTLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cufft_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cufft shared library. Cannot invoke method %s.",
fn_name));
}
bool HasCUFFT() { return pten::dynload::HasCUFFT(); }
} // namespace dynload
} // namespace platform
......
......@@ -19,32 +19,17 @@ limitations under the License. */
#include <glog/logging.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cufft.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cufft_dso_flag;
extern void* cufft_dso_handle;
extern bool HasCUFFT();
extern void EnforceCUFFTLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cufft_func = decltype(&::__name); \
std::call_once(cufft_dso_flag, []() { \
cufft_dso_handle = paddle::platform::dynload::GetCUFFTDsoHandle(); \
}); \
EnforceCUFFTLoaded(#__name); \
static void* p_##__name = dlsym(cufft_dso_handle, #__name); \
return reinterpret_cast<cufft_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cufft functions in HPPL
......@@ -104,7 +89,7 @@ extern void EnforceCUFFTLoaded(const char* fn_name);
__macro(cufftXtExecDescriptor); \
__macro(cufftXtSetWorkAreaPolicy);
CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
CUFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
} // namespace dynload
} // namespace platform
......
......@@ -20,9 +20,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cupti_dso_flag;
void *cupti_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUPTI_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -19,16 +19,12 @@ limitations under the License. */
#include <cupti.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cupti.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cupti_dso_flag;
extern void *cupti_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cupti routine
......@@ -36,18 +32,8 @@ extern void *cupti_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
using cuptiFunc = decltype(&::__name); \
std::call_once(cupti_dso_flag, []() { \
cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
}); \
static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \
}; \
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CUPTI_ROUTINE_EACH(__macro) \
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag curand_dso_flag;
void *curand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,27 +16,14 @@ limitations under the License. */
#include <curand.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/curand.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag curand_dso_flag;
extern void *curand_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
curandStatus_t operator()(Args... args) { \
using curandFunc = decltype(&::__name); \
std::call_once(curand_dso_flag, []() { \
curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(curand_dso_handle, #__name); \
return reinterpret_cast<curandFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CURAND_RAND_ROUTINE_EACH(__macro) \
......@@ -48,7 +35,7 @@ extern void *curand_dso_handle;
__macro(curandGenerateNormal); \
__macro(curandDestroyGenerator);
CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
CURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace platform
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cusolver_dso_flag;
void *cusolver_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUSOLVER_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,28 +17,14 @@ limitations under the License. */
#include <cusolverDn.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cusolver.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cusolver_dso_flag;
extern void *cusolver_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cusolverStatus_t operator()(Args... args) { \
using cusolverFunc = decltype(&::__name); \
std::call_once(cusolver_dso_flag, []() { \
cusolver_dso_handle = \
paddle::platform::dynload::GetCusolverDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \
return reinterpret_cast<cusolverFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CUSOLVER_ROUTINE_EACH(__macro) \
......@@ -62,7 +48,7 @@ extern void *cusolver_dso_handle;
__macro(cusolverDnCheevd); \
__macro(cusolverDnZheevd);
CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
CUSOLVER_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
#if CUDA_VERSION >= 9020
#define CUSOLVER_ROUTINE_EACH_R1(__macro) \
......@@ -105,7 +91,7 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
__macro(cusolverDnCungqr); \
__macro(cusolverDnZungqr);
CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
CUSOLVER_ROUTINE_EACH_R1(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#if CUDA_VERSION >= 9020
......@@ -117,10 +103,10 @@ CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
__macro(cusolverDnDsyevj); \
__macro(cusolverDnDestroySyevjInfo);
CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
CUSOLVER_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cusparse_dso_flag;
void *cusparse_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#ifdef CUSPARSE_ROUTINE_EACH
......
......@@ -17,28 +17,14 @@ limitations under the License. */
#include <cusparse.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cusparse.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cusparse_dso_flag;
extern void *cusparse_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cusparseStatus_t operator()(Args... args) { \
using cusparseFunc = decltype(&::__name); \
std::call_once(cusparse_dso_flag, []() { \
cusparse_dso_handle = \
paddle::platform::dynload::GetCusparseDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
return reinterpret_cast<cusparseFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#if defined(PADDLE_WITH_CUDA)
......@@ -54,7 +40,7 @@ extern void *cusparse_dso_handle;
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
......@@ -74,7 +60,7 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
......@@ -83,13 +69,13 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
__macro(cusparseSDDMM_preprocess); \
__macro(cusparseSDDMM);
CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -15,556 +15,61 @@ limitations under the License. */
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/enforce.h"
#if defined(_WIN32)
#include <windows.h>
#endif
DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, "
"/usr/local/cudnn/lib. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH");
DEFINE_string(
cuda_dir, "",
"Specify path for loading cuda library, such as libcublas, libcublasLt "
"libcurand, libcusolver. For instance, /usr/local/cuda/lib64. "
"If default, dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(nccl_dir, "",
"Specify path for loading nccl library, such as libnccl.so. "
"For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(hccl_dir, "",
"Specify path for loading hccl library, such as libhccl.so. "
"For instance, "
"/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. If "
"default, "
"dlopen will search hccl from LD_LIBRARY_PATH");
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
DEFINE_string(
tensorrt_dir, "",
"Specify path for loading tensorrt library, such as libnvinfer.so.");
DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
DEFINE_string(mkl_dir, "",
"Specify path for loading libmkl_rt.so. "
"For insrance, /opt/intel/oneapi/mkl/latest/lib/intel64/."
"If default, "
"dlopen will search mkl from LD_LIBRARY_PATH");
DEFINE_string(op_dir, "", "Specify path for loading user-defined op library.");
#ifdef PADDLE_WITH_HIP
DEFINE_string(miopen_dir, "",
"Specify path for loading libMIOpen.so. For instance, "
"/opt/rocm/miopen/lib. If empty [default], dlopen "
"will search miopen from LD_LIBRARY_PATH");
DEFINE_string(rocm_dir, "",
"Specify path for loading rocm library, such as librocblas, "
"libmiopen, libhipsparse. For instance, /opt/rocm/lib. "
"If default, dlopen will search rocm from LD_LIBRARY_PATH");
DEFINE_string(rccl_dir, "",
"Specify path for loading rccl library, such as librccl.so. "
"For instance, /opt/rocm/rccl/lib. If default, "
"dlopen will search rccl from LD_LIBRARY_PATH");
#endif
#include "paddle/pten/backends/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
struct PathNode {
PathNode() {}
std::string path = "";
};
static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
// NOTE: In order to adapt to the default installation path of cuda
#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin";
#else
static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64";
#endif
static PathNode s_py_site_pkg_path;
#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll";
static constexpr char* win_cublas_lib =
"cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cublas64_" CUDA_VERSION_MAJOR ".dll";
#if CUDA_VERSION >= 11000
static constexpr char* win_curand_lib =
"curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll";
static constexpr char* win_nvjpeg_lib =
"nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll";
static constexpr char* win_cusolver_lib =
"cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll";
static constexpr char* win_cusparse_lib =
"cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll";
static constexpr char* win_cufft_lib =
"cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_10.dll";
#else
static constexpr char* win_curand_lib =
"curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;curand64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_nvjpeg_lib =
"nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cusolver_lib =
"cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cusparse_lib =
"cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cufft_lib =
"cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cufft64_" CUDA_VERSION_MAJOR ".dll";
#endif // CUDA_VERSION
#endif
static inline std::string join(const std::string& part1,
const std::string& part2) {
// directory separator
const char sep = '/';
if (!part2.empty() && part2.front() == sep) {
return part2;
}
std::string ret;
ret.reserve(part1.size() + part2.size() + 1);
ret = part1;
if (!ret.empty() && ret.back() != sep) {
ret += sep;
}
ret += part2;
return ret;
}
static inline std::vector<std::string> split(
const std::string& str, const std::string separator = " ") {
std::vector<std::string> str_list;
std::string::size_type firstPos;
firstPos = str.find_first_not_of(separator, 0);
std::string::size_type lastPos;
lastPos = str.find_first_of(separator, firstPos);
while (std::string::npos != firstPos && std::string::npos != lastPos) {
str_list.push_back(str.substr(firstPos, lastPos - firstPos));
firstPos = str.find_first_not_of(separator, lastPos);
lastPos = str.find_first_of(separator, firstPos);
}
// Also require a valid firstPos: for an empty or all-separator input both
// positions are npos and substr(npos) would throw.
if (std::string::npos != firstPos && std::string::npos == lastPos) {
str_list.push_back(str.substr(firstPos, lastPos - firstPos));
}
return str_list;
}
void SetPaddleLibPath(const std::string& py_site_pkg_path) {
s_py_site_pkg_path.path = py_site_pkg_path;
VLOG(3) << "Set paddle lib path : " << py_site_pkg_path;
pten::dynload::SetPaddleLibPath(py_site_pkg_path);
}
static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
const std::string& dso_name,
int dynload_flags) {
void* dso_handle = nullptr;
if (!spec_path.empty()) {
// search xxx.so from custom path
VLOG(3) << "Try to find library: " << dso_name
<< " from specific path: " << spec_path;
std::string dso_path = join(spec_path, dso_name);
dso_handle = dlopen(dso_path.c_str(), dynload_flags);
}
return dso_handle;
}
static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
int dynload_flags) {
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
// and /usr/local/lib path
void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
VLOG(3) << "Try to find library: " << dso_path
<< " from default system path.";
// TODO(chenweihang): Which libs is this path used to search for?
// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 due to
// System Integrity Protection (SIP); if dso_handle is null,
// search the default package path on Mac OS instead.
#if defined(__APPLE__) || defined(__OSX__)
if (nullptr == dso_handle) {
dso_handle =
dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
}
#endif
return dso_handle;
}
/*
* We define three priorities for dynamic library search:
*
* First: Search for the path specified by the user
* Second: Search the system default path
* Third: Search for a special path corresponding to
* a specific library to adapt to changes and easy to expand.
*/
static inline void* GetDsoHandleFromSearchPath(
const std::string& config_path, const std::string& dso_name,
bool throw_on_error = true,
const std::vector<std::string>& extra_paths = std::vector<std::string>(),
const std::string& warning_msg = std::string()) {
#if !defined(_WIN32)
int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
#else
int dynload_flags = 0;
#endif // !_WIN32
std::vector<std::string> dso_names = split(dso_name, ";");
void* dso_handle = nullptr;
for (auto dso : dso_names) {
// 1. search in user config path by FLAGS
dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags);
// 2. search in system default path
if (nullptr == dso_handle) {
dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags);
}
// 3. search in extra paths
if (nullptr == dso_handle) {
for (auto path : extra_paths) {
VLOG(3) << "extra_paths: " << path;
dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags);
}
}
if (nullptr != dso_handle) break;
}
// 4. [If failed for all dso_names] log the warning message if one exists
if (nullptr == dso_handle && !warning_msg.empty()) {
LOG(WARNING) << warning_msg;
}
// 5. [If failed for all dso_names] log or throw the error info
if (nullptr == dso_handle) {
auto error_msg =
"The third-party dynamic library (%s) that Paddle depends on is not "
"configured correctly. (error code is %s)\n"
" Suggestions:\n"
" 1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) "
"is installed correctly and its version is matched with paddlepaddle "
"you installed.\n"
" 2. Configure third-party dynamic library environment variables as "
"follows:\n"
" - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
" - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
" - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` "
"[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
"impossible unless System Integrity Protection (SIP) is disabled.]";
#if !defined(_WIN32)
auto errorno = dlerror();
#else
auto errorno = GetLastError();
#endif // !_WIN32
if (throw_on_error) {
// NOTE: Special error report case, no need to change its format
PADDLE_THROW(
platform::errors::PreconditionNotMet(error_msg, dso_name, errorno));
} else {
LOG(WARNING) << string::Sprintf(error_msg, dso_name, errorno);
}
}
return dso_handle;
}
void* GetCublasDsoHandle() { return pten::dynload::GetCublasDsoHandle(); }
void* GetCublasLtDsoHandle() { return pten::dynload::GetCublasLtDsoHandle(); }
void* GetCUDNNDsoHandle() { return pten::dynload::GetCUDNNDsoHandle(); }
void* GetCUPTIDsoHandle() { return pten::dynload::GetCUPTIDsoHandle(); }
void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib, true,
{cuda_lib_path});
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
#endif
}
void* GetCublasLtDsoHandle() {
// APIs available after CUDA 10.1
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
#else
std::string warning_msg(
"Your CUDA_VERSION is less than 10.1, which does not support CublasLt. "
"If you want to use CublasLt, please upgrade CUDA and rebuild "
"PaddlePaddle.");
LOG(WARNING) << warning_msg;
return nullptr;
#endif
}
void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
std::string mac_warn_meg(
"Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
"For instance, sudo tar -xzf "
"cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
"chmod a+r /usr/local/cuda/include/cudnn.h "
"/usr/local/cuda/lib/libcudnn*");
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false,
{}, mac_warn_meg);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
std::string win_warn_meg(
"Note: [Recommend] copy cudnn into CUDA installation directory. \n "
"For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from "
"NVIDIA's official website, \n"
"then, unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing "
"Toolkit\\CUDA\\v10.0\n"
"You should do this according to your CUDA installation directory and "
"CUDNN version.");
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib, true,
{cuda_lib_path}, win_warn_meg);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false,
{cuda_lib_path});
#endif
}
void* GetCUPTIDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.dylib", false,
{cupti_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.so", false,
{cupti_lib_path});
#endif
}
void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib, true,
{cuda_lib_path});
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
#endif
}
void* GetCurandDsoHandle() { return pten::dynload::GetCurandDsoHandle(); }
#ifdef PADDLE_WITH_HIP
void* GetROCFFTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so");
#endif
}
void* GetROCFFTDsoHandle() { return pten::dynload::GetROCFFTDsoHandle(); }
#endif
void* GetNvjpegDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_nvjpeg_lib, true,
{cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so");
#endif
}
void* GetNvjpegDsoHandle() { return pten::dynload::GetNvjpegDsoHandle(); }
void* GetCusolverDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cusolver_lib, true,
{cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
#endif
}
void* GetCusolverDsoHandle() { return pten::dynload::GetCusolverDsoHandle(); }
void* GetCusparseDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cusparse_lib, true,
{cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so");
#endif
}
void* GetCusparseDsoHandle() { return pten::dynload::GetCusparseDsoHandle(); }
void* GetNVRTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
#endif
}
void* GetNVRTCDsoHandle() { return pten::dynload::GetNVRTCDsoHandle(); }
void* GetCUDADsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
#elif defined(_WIN32)
char system32_dir[MAX_PATH];
GetSystemDirectory(system32_dir, MAX_PATH);
return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
#endif
}
void* GetCUDADsoHandle() { return pten::dynload::GetCUDADsoHandle(); }
void* GetWarpCTCDsoHandle() {
std::string warpctc_dir = "";
if (!s_py_site_pkg_path.path.empty()) {
warpctc_dir = s_py_site_pkg_path.path;
}
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll");
#else
return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so");
#endif
}
void* GetWarpCTCDsoHandle() { return pten::dynload::GetWarpCTCDsoHandle(); }
void* GetNCCLDsoHandle() {
#ifdef PADDLE_WITH_HIP
std::string warning_msg(
"You may need to install 'rccl' from ROCM official website: "
"https://rocmdocs.amd.com/en/latest/Installation_Guide/"
"Installation-Guide.html before install PaddlePaddle.");
#else
std::string warning_msg(
"You may need to install 'nccl2' from NVIDIA official website: "
"https://developer.nvidia.com/nccl/nccl-download"
"before install PaddlePaddle.");
#endif
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true, {},
warning_msg);
#else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
warning_msg);
#endif
}
void* GetNCCLDsoHandle() { return pten::dynload::GetNCCLDsoHandle(); }
void* GetHCCLDsoHandle() { return pten::dynload::GetHCCLDsoHandle(); }
void* GetHCCLDsoHandle() {
std::string warning_msg(
"You may need to install 'hccl2' from Huawei official website: "
"before install PaddlePaddle.");
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
#elif defined(PADDLE_WITH_ASCEND_CL)
return GetDsoHandleFromSearchPath(FLAGS_hccl_dir, "libhccl.so", true, {},
warning_msg);
#else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
warning_msg);
#endif
}
void* GetTensorRtDsoHandle() { return pten::dynload::GetTensorRtDsoHandle(); }
void* GetMKLMLDsoHandle() { return pten::dynload::GetMKLMLDsoHandle(); }
void* GetTensorRtDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
#endif
}
void* GetMKLMLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
#endif
}
void* GetLAPACKDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3");
#endif
}
void* GetLAPACKDsoHandle() { return pten::dynload::GetLAPACKDsoHandle(); }
void* GetOpDsoHandle(const std::string& dso_name) {
return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
return pten::dynload::GetOpDsoHandle(dso_name);
}
void* GetNvtxDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Apple."));
#elif defined(_WIN32)
PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Windows."));
#elif !defined(PADDLE_WITH_CUDA)
PADDLE_THROW(
platform::errors::Unimplemented("Nvtx is not supported without CUDA."));
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
#endif
}
void* GetNvtxDsoHandle() { return pten::dynload::GetNvtxDsoHandle(); }
void* GetCUFFTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cufft_lib, true,
{cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so");
#endif
}
void* GetCUFFTDsoHandle() { return pten::dynload::GetCUFFTDsoHandle(); }
void* GetMKLRTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so");
#endif
}
void* GetMKLRTDsoHandle() { return pten::dynload::GetMKLRTDsoHandle(); }
} // namespace dynload
} // namespace platform
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag hipfft_dso_flag;
void *hipfft_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,8 +17,7 @@ limitations under the License. */
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/hipfft.h"
namespace paddle {
namespace platform {
......@@ -26,18 +25,8 @@ namespace dynload {
extern std::once_flag hipfft_dso_flag;
extern void *hipfft_dso_handle;
#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hipfftFunc = decltype(&::__name); \
std::call_once(hipfft_dso_flag, []() { \
hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \
}); \
static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \
return reinterpret_cast<hipfftFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define HIPFFT_FFT_ROUTINE_EACH(__macro) \
......@@ -70,53 +59,8 @@ extern void *hipfft_dso_handle;
__macro(hipfftGetVersion); \
__macro(hipfftGetProperty);
HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
HIPFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
inline const char *hipfftGetErrorString(hipfftResult_t status) {
switch (status) {
case HIPFFT_SUCCESS:
return "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
case HIPFFT_INVALID_PLAN:
return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
case HIPFFT_ALLOC_FAILED:
return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
"memory.";
case HIPFFT_INVALID_TYPE:
return "'HIPFFT_INVALID_TYPE'. No longer used.";
case HIPFFT_INVALID_VALUE:
return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
"parameter.";
case HIPFFT_INTERNAL_ERROR:
return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
"error.";
case HIPFFT_EXEC_FAILED:
return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
case HIPFFT_SETUP_FAILED:
return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
case HIPFFT_INVALID_SIZE:
return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
case HIPFFT_UNALIGNED_DATA:
return "'HIPFFT_UNALIGNED_DATA'. No longer used.";
case HIPFFT_INCOMPLETE_PARAMETER_LIST:
return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
case HIPFFT_INVALID_DEVICE:
return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
"GPU than plan creation.";
case HIPFFT_PARSE_ERROR:
return "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
case HIPFFT_NO_WORKSPACE:
return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
"plan execution.";
case HIPFFT_NOT_IMPLEMENTED:
return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
"functionality for parameters given.";
case HIPFFT_NOT_SUPPORTED:
return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
"parameters given.";
default:
return "HIPFFT_STATUS_UNKNOWN_ERROR";
}
}
} // namespace dynload
} // namespace platform
} // namespace paddle
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag hiprand_dso_flag;
void *hiprand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,28 +16,15 @@ limitations under the License. */
#include <hiprand.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/hiprand.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag hiprand_dso_flag;
extern void *hiprand_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
hiprandStatus_t operator()(Args... args) { \
using hiprandFunc = decltype(&::__name); \
std::call_once(hiprand_dso_flag, []() { \
hiprand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \
return reinterpret_cast<hiprandFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define HIPRAND_RAND_ROUTINE_EACH(__macro) \
......@@ -49,7 +36,7 @@ extern void *hiprand_dso_handle;
__macro(hiprandGenerateNormal); \
__macro(hiprandDestroyGenerator);
HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
HIPRAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace platform
......
......@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/hiprtc.h"
#include "paddle/pten/backends/dynload/hiprtc.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag hiprtc_dso_flag;
void* hiprtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() {
std::call_once(hiprtc_dso_flag,
[]() { hiprtc_dso_handle = GetNVRTCDsoHandle(); });
return hiprtc_dso_handle != nullptr;
}
bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
} // namespace dynload
} // namespace platform
......
......@@ -16,30 +16,17 @@ limitations under the License. */
#include <hip/hiprtc.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/hiprtc.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag hiprtc_dso_flag;
extern void* hiprtc_dso_handle;
extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hiprtc_func = decltype(&::__name); \
std::call_once(hiprtc_dso_flag, []() { \
hiprtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \
return reinterpret_cast<hiprtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed hiprtc functions
......@@ -55,9 +42,9 @@ extern bool HasNVRTC();
__macro(hiprtcGetProgramLog); \
__macro(hiprtcGetProgramLogSize)
HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
HIPRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
} // namespace dynload
} // namespace platform
......
......@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/lapack.h"
#include <mutex>
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag lapack_dso_flag;
void* lapack_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
LAPACK_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,122 +16,20 @@ limitations under the License. */
#include <complex>
#include <mutex>
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
// Note(zhouwei): LAPACK does not ship an appropriate header file, so the API
// prototypes must be declared here ourselves.
// getrf_ (for example)
extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv,
int *info);
extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv,
int *info);
// evd
extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex<double> *a,
int *lda, double *w, std::complex<double> *work,
int *lwork, double *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex<float> *a,
int *lda, float *w, std::complex<float> *work,
int *lwork, float *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda,
double *w, double *work, int *lwork, int *iwork,
int *liwork, int *info);
extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda,
float *w, float *work, int *lwork, int *iwork,
int *liwork, int *info);
// geev
extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda,
double *wr, double *wi, double *vl, int *ldvl,
double *vr, int *ldvr, double *work, int *lwork,
int *info);
extern "C" void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda,
float *wr, float *wi, float *vl, int *ldvl, float *vr,
int *ldvr, float *work, int *lwork, int *info);
extern "C" void zgeev_(char *jobvl, char *jobvr, int *n,
std::complex<double> *a, int *lda,
std::complex<double> *w, std::complex<double> *vl,
int *ldvl, std::complex<double> *vr, int *ldvr,
std::complex<double> *work, int *lwork, double *rwork,
int *info);
extern "C" void cgeev_(char *jobvl, char *jobvr, int *n, std::complex<float> *a,
int *lda, std::complex<float> *w,
std::complex<float> *vl, int *ldvl,
std::complex<float> *vr, int *ldvr,
std::complex<float> *work, int *lwork, float *rwork,
int *info);
// gels
extern "C" void dgels_(char *trans, int *m, int *n, int *nrhs, double *a,
int *lda, double *b, int *ldb, double *work, int *lwork,
int *info);
extern "C" void sgels_(char *trans, int *m, int *n, int *nrhs, float *a,
int *lda, float *b, int *ldb, float *work, int *lwork,
int *info);
// gelsd
extern "C" void dgelsd_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, double *s, double *rcond,
int *rank, double *work, int *lwork, int *iwork,
int *info);
extern "C" void sgelsd_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, float *s, float *rcond, int *rank,
float *work, int *lwork, int *iwork, int *info);
// gelsy
extern "C" void dgelsy_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, int *jpvt, double *rcond,
int *rank, double *work, int *lwork, int *info);
extern "C" void sgelsy_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, int *jpvt, float *rcond, int *rank,
float *work, int *lwork, int *info);
// gelss
extern "C" void dgelss_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, double *s, double *rcond,
int *rank, double *work, int *lwork, int *info);
extern "C" void sgelss_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, float *s, float *rcond, int *rank,
float *work, int *lwork, int *info);
extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex<double> *a,
int *lda, std::complex<double> *b, int *ldb, int *info);
extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex<float> *a,
int *lda, std::complex<float> *b, int *ldb, int *info);
extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, int *info);
extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda,
float *b, int *ldb, int *info);
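// Illustrative call through these prototypes (hypothetical 2x2 system; LAPACK
// expects column-major storage; not part of this header):
//   int m = 2, n = 2, lda = 2, info = 0;
//   int ipiv[2];
//   double a[4] = {4.0, 2.0, 1.0, 3.0};
//   dgetrf_(&m, &n, a, &lda, ipiv, &info);  // LU factorization; info == 0 on success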
#include "paddle/pten/backends/dynload/lapack.h"
#include "paddle/pten/common/complex.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag lapack_dso_flag;
extern void *lapack_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load lapack routines
* via operator overloading.
*/
#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using lapackFunc = decltype(&::__name); \
std::call_once(lapack_dso_flag, []() { \
lapack_dso_handle = paddle::platform::dynload::GetLAPACKDsoHandle(); \
}); \
    static void *p_##__name = dlsym(lapack_dso_handle, #__name); \
    return reinterpret_cast<lapackFunc>(p_##__name)(args...); \
} \
}; \
#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
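// e.g. DYNAMIC_LOAD_LAPACK_WRAP(dgetrf_) keeps the
// paddle::platform::dynload::dgetrf_ symbol available to existing fluid call
// sites while routing every call to pten::dynload::dgetrf_.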
#define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \
......
......@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag miopen_dso_flag;
void* miopen_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......@@ -50,19 +48,7 @@ MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(miopen_dso_flag,
[]() { miopen_dso_handle = GetCUDNNDsoHandle(); });
return miopen_dso_handle != nullptr;
}
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
miopen_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load miopen shared library. Cannot invoke method %s.",
fn_name));
}
bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
} // namespace dynload
} // namespace platform
......
......@@ -18,66 +18,17 @@ limitations under the License. */
#include <miopen/miopen.h>
#include <miopen/version.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#define MIOPEN_VERSION \
(MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \
MIOPEN_VERSION_PATCH) // NOLINT
// MIOpen only supports NCHW; this typedef exists just for compatibility with
// the cuDNN API.
typedef enum {
MIOPEN_TENSOR_NCHW = 0,
MIOPEN_TENSOR_NHWC = 1,
} miopenTensorFormat_t;
#include "paddle/pten/backends/dynload/miopen.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag miopen_dso_flag;
extern void* miopen_dso_handle;
extern bool HasCUDNN();
inline const char* miopenGetErrorString(miopenStatus_t status) {
switch (status) {
case miopenStatusSuccess:
return "MIOPEN_STATUS_SUCCESS";
case miopenStatusNotInitialized:
return "MIOPEN_STATUS_NOT_INITIALIZED";
case miopenStatusInvalidValue:
return "MIOPEN_STATUS_INVALID_VALUE";
case miopenStatusBadParm:
return "MIOPEN_STATUS_BAD_PARAM";
case miopenStatusAllocFailed:
return "MIOPEN_STATUS_ALLOC_FAILED";
case miopenStatusInternalError:
return "MIOPEN_STATUS_INTERNAL_ERROR";
case miopenStatusNotImplemented:
return "MIOPEN_STATUS_NOT_IMPLEMENTED";
case miopenStatusUnsupportedOp:
return "MIOPEN_STATUS_UNSUPPORTED_OP";
case miopenStatusUnknownError:
default:
return "MIOPEN_STATUS_UNKNOWN_ERROR";
}
}
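// Usage sketch (illustrative): error paths can stay API-agnostic by formatting
// failures the same way as on the CUDA side, e.g.
//   LOG(ERROR) << "MIOpen failure: " << miopenGetErrorString(status);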
extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using miopen_func = decltype(&::__name); \
std::call_once(miopen_dso_flag, []() { \
miopen_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(miopen_dso_handle, #__name); \
return reinterpret_cast<miopen_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed miopen functions in HPPL
......@@ -145,23 +96,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(miopenRNNForwardInference); \
__macro(miopenGetTensorNumBytes);
MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \
__macro(miopenConvolutionBackwardData);
MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R3:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
__macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R4:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
__macro(miopenBatchNormalizationForwardTraining); \
__macro(miopenBatchNormalizationForwardInference); \
__macro(miopenBatchNormalizationBackward);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R5
#define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \
......@@ -169,12 +120,12 @@ MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
__macro(miopenSetActivationDescriptor); \
__macro(miopenGetActivationDescriptor); \
__macro(miopenDestroyActivationDescriptor);
MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R5(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R6
#define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \
/*__macro(miopenSetRNNDescriptor_v6);*/
MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R6(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(miopenSetConvolutionGroupCount); \
......@@ -184,7 +135,7 @@ MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
__macro(miopenSetCTCLossDescriptor); \
__macro(miopenGetCTCLossWorkspaceSize); \
__macro(miopenCTCLoss);
MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
/*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
......@@ -192,7 +143,7 @@ __macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag mklml_dso_flag;
void* mklml_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MKLML_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,36 +17,23 @@ limitations under the License. */
#include <mkl.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/mklml.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag mklml_dso_flag;
extern void *mklml_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load mklml routines
* via operator overloading.
*/
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklmlFunc = decltype(&::__name); \
std::call_once(mklml_dso_flag, []() { \
mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
}); \
    static void *p_##__name = dlsym(mklml_dso_handle, #__name); \
    return reinterpret_cast<mklmlFunc>(p_##__name)(args...); \
} \
}; \
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
#define PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) \
DYNAMIC_LOAD_MKLML_WRAP(__name)
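// Both spellings expand to the same alias; e.g. for cblas_sgemm (listed below):
//   using DynLoad__cblas_sgemm = pten::dynload::DynLoad__cblas_sgemm;
//   extern DynLoad__cblas_sgemm cblas_sgemm;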
#define MKLML_ROUTINE_EACH(__macro) \
__macro(cblas_sgemm); \
......@@ -111,7 +98,7 @@ extern void *mklml_dso_handle;
__macro(MKL_Set_Num_Threads); \
__macro(MKL_Get_Max_Threads);
MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
MKLML_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
#if !defined(_WIN32)
DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace platform {
......@@ -32,18 +32,8 @@ extern void* mklrt_dso_handle;
* (for each function) to dynamic load mkldfti routine
* via operator overloading.
*/
#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklrtFunc = decltype(&::__name); \
std::call_once(mklrt_dso_flag, []() { \
mklrt_dso_handle = paddle::platform::dynload::GetMKLRTDsoHandle(); \
}); \
static void* p_##__name = dlsym(mklrt_dso_handle, #__name); \
return reinterpret_cast<mklrtFunc>(p_##__name)(args...); \
} \
}; \
#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
// mkl_dfti.h has a macro that shadows the function with the same name
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nccl_dso_flag;
void *nccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,28 +16,14 @@ limitations under the License. */
#include <nccl.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nccl.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(nccl_dso_flag, []() { \
nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define NCCL_RAND_ROUTINE_EACH(__macro) \
......@@ -57,30 +43,30 @@ extern void* nccl_dso_handle;
__macro(ncclReduceScatter); \
__macro(ncclGetErrorString);
NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212
#define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2304
#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion);
NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 21100
#define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \
__macro(ncclRedOpCreatePreMulSum); \
__macro(ncclRedOpDestroy);
NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
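// Illustration (assumes NCCL >= 2.7.3 at build time): the version-gated
// wrappers above make point-to-point routines callable as, e.g.,
//   pten::dynload::ncclSend(sendbuff, count, ncclFloat, peer, comm, stream);
// against an older libnccl those declarations are simply never emitted.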
} // namespace dynload
......
......@@ -15,9 +15,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nvjpeg_dso_flag;
void *nvjpeg_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -14,27 +14,14 @@ limitations under the License. */
#include <nvjpeg.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nvjpeg.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nvjpeg_dso_flag;
extern void *nvjpeg_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
nvjpegStatus_t operator()(Args... args) { \
using nvjpegFunc = decltype(&::__name); \
std::call_once(nvjpeg_dso_flag, []() { \
nvjpeg_dso_handle = paddle::platform::dynload::GetNvjpegDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \
return reinterpret_cast<nvjpegFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define NVJPEG_RAND_ROUTINE_EACH(__macro) \
......@@ -44,7 +31,7 @@ extern void *nvjpeg_dso_handle;
__macro(nvjpegJpegStateDestroy); \
__macro(nvjpegDecode);
NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
NVJPEG_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
} // namespace dynload
} // namespace platform
......
......@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/nvrtc.h"
#include "paddle/pten/backends/dynload/nvrtc.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nvrtc_dso_flag;
void* nvrtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() {
std::call_once(nvrtc_dso_flag,
[]() { nvrtc_dso_handle = GetNVRTCDsoHandle(); });
return nvrtc_dso_handle != nullptr;
}
bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
} // namespace dynload
} // namespace platform
......
......@@ -17,30 +17,17 @@ limitations under the License. */
#include <nvrtc.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nvrtc.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nvrtc_dso_flag;
extern void* nvrtc_dso_handle;
extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using nvrtc_func = decltype(&::__name); \
std::call_once(nvrtc_dso_flag, []() { \
nvrtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(nvrtc_dso_handle, #__name); \
return reinterpret_cast<nvrtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed nvrtc functions
......@@ -56,9 +43,9 @@ extern bool HasNVRTC();
__macro(nvrtcGetProgramLog); \
__macro(nvrtcGetProgramLogSize)
NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
NVRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
} // namespace dynload
} // namespace platform
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nvtx_dso_flag;
void *nvtx_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVTX_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,36 +17,23 @@ limitations under the License. */
#include <nvToolsExt.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nvtx.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nvtx_dso_flag;
extern void *nvtx_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
int operator()(Args... args) { \
using nvtxFunc = decltype(&::__name); \
std::call_once(nvtx_dso_flag, []() { \
nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \
return reinterpret_cast<nvtxFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \
__macro(nvtxRangePop);
NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag rccl_dso_flag;
void *rccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,28 +16,14 @@ limitations under the License. */
#include <rccl.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/rccl.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag rccl_dso_flag;
extern void* rccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(rccl_dso_flag, []() { \
rccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(rccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define RCCL_RAND_ROUTINE_EACH(__macro) \
......@@ -57,18 +43,18 @@ extern void* rccl_dso_handle;
__macro(ncclReduceScatter); \
__macro(ncclGetErrorString);
RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
RCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212
#define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
RCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
RCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
} // namespace dynload
......
......@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag rocblas_dso_flag;
void *rocblas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......
......@@ -19,16 +19,12 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/rocblas.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag rocblas_dso_flag;
extern void *rocblas_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load rocblas routines
......@@ -36,18 +32,8 @@ extern void *rocblas_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
rocblas_status operator()(Args... args) { \
using rocblas_func = decltype(&::__name); \
std::call_once(rocblas_dso_flag, []() { \
rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \
return reinterpret_cast<rocblas_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
......@@ -83,7 +69,7 @@ extern void *rocblas_dso_handle;
__macro(rocblas_set_pointer_mode); \
__macro(rocblas_get_pointer_mode);
ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// APIs mirroring those added after CUDA 8.0 in cuBLAS
#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
......@@ -94,21 +80,21 @@ ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
__macro(rocblas_zgemm_strided_batched); \
__macro(rocblas_hgemm_strided_batched);
ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// HIP not supported in ROCm 3.5
// #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
// __macro(cublasSetMathMode);
// __macro(cublasGetMathMode);
// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// ROCBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(rocblas_gemm_batched_ex); \
__macro(rocblas_gemm_strided_batched_ex);
ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -13,22 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/rocm_driver.h"
#include "paddle/pten/backends/dynload/rocm_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag rocm_dso_flag;
void* rocm_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
ROCM_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() {
std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); });
return rocm_dso_handle != nullptr;
}
bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
} // namespace dynload
} // namespace platform
......
......@@ -17,30 +17,17 @@ limitations under the License. */
#include <hip/hip_runtime.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/rocm_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag rocm_dso_flag;
extern void* rocm_dso_handle;
extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using rocm_func = decltype(&::__name); \
std::call_once(rocm_dso_flag, []() { \
rocm_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(rocm_dso_handle, #__name); \
return reinterpret_cast<rocm_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cuda driver functions
......@@ -59,9 +46,9 @@ extern bool HasCUDADriver();
__macro(hipGetDeviceCount); \
__macro(hipDevicePrimaryCtxGetState)
ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
ROCM_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP
} // namespace dynload
} // namespace platform
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag warpctc_dso_flag;
void* warpctc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
WARPCTC_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,34 +16,19 @@ limitations under the License. */
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "warpctc/include/ctc.h"
#include "paddle/pten/backends/dynload/warpctc.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag warpctc_dso_flag;
extern void* warpctc_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load warpctc routines
* via operator overloading.
*/
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using warpctcFunc = decltype(&::__name); \
std::call_once(warpctc_dso_flag, []() { \
warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
}); \
    static void* p_##__name = dlsym(warpctc_dso_handle, #__name); \
    return reinterpret_cast<warpctcFunc>(p_##__name)(args...); \
} \
}; \
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
......
......@@ -65,30 +65,30 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/variant.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/to_string.h"
#include "paddle/pten/backends/dynload/port.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/curand.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/pten/backends/dynload/curand.h"
#include "paddle/pten/backends/dynload/cusolver.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include <error.h>
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/pten/backends/dynload/nccl.h"
#endif // __APPLE__
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hipfft.h"
#include "paddle/fluid/platform/dynload/hiprand.h"
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/dynload/hipfft.h"
#include "paddle/pten/backends/dynload/hiprand.h"
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/pten/backends/dynload/rocblas.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include <error.h> // NOLINT
#include "paddle/fluid/platform/dynload/rccl.h"
#include "paddle/pten/backends/dynload/rccl.h"
#endif // __APPLE__
#endif // PADDLE_WITH_HIP
......@@ -880,7 +880,7 @@ inline bool is_error(cudnnStatus_t stat) {
inline std::string build_nvidia_error_msg(cudnnStatus_t stat) {
std::ostringstream sout;
sout << "CUDNN error(" << stat << "), "
<< platform::dynload::cudnnGetErrorString(stat) << ". "
<< pten::dynload::cudnnGetErrorString(stat) << ". "
<< GetExternalErrorMsg(stat);
return sout.str();
}
......@@ -945,7 +945,7 @@ inline bool is_error(ncclResult_t nccl_result) {
inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) {
std::ostringstream sout;
sout << "NCCL error(" << nccl_result << "), "
<< platform::dynload::ncclGetErrorString(nccl_result) << ". ";
<< pten::dynload::ncclGetErrorString(nccl_result) << ". ";
if (errno == ENOSPC || errno == EAGAIN) {
std::string detail(strerror(errno));
detail += "\nPlease try one of the following solutions:";
......@@ -1090,7 +1090,7 @@ inline bool is_error(miopenStatus_t stat) {
inline std::string build_rocm_error_msg(miopenStatus_t stat) {
std::string msg(" Miopen error, ");
return msg + platform::dynload::miopenGetErrorString(stat) + " ";
return msg + pten::dynload::miopenGetErrorString(stat) + " ";
}
/***** ROCBLAS ERROR *****/
......@@ -1132,7 +1132,7 @@ inline bool is_error(ncclResult_t nccl_result) {
inline std::string build_rocm_error_msg(ncclResult_t nccl_result) {
std::string msg(" Rccl error, ");
return msg + platform::dynload::ncclGetErrorString(nccl_result) + " ";
return msg + pten::dynload::ncclGetErrorString(nccl_result) + " ";
}
#endif // not(__APPLE__) and PADDLE_WITH_NCCL
......@@ -1141,7 +1141,7 @@ inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; }
inline std::string build_rocm_error_msg(hipfftResult_t stat) {
std::string msg(" HIPFFT error, ");
return msg + platform::dynload::hipfftGetErrorString(stat) + " ";
return msg + pten::dynload::hipfftGetErrorString(stat) + " ";
}
namespace details {
......
......@@ -24,6 +24,8 @@ limitations under the License. */
#include <unistd.h>
#elif defined(_MSC_VER)
#include <processthreadsapi.h>
#else
#include <unistd.h>
#endif
#include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#ifdef _POSIX_C_SOURCE
#include <time.h>
#endif
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace platform {
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <stdlib.h>
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
#ifdef _WIN32
static unsigned sleep(unsigned seconds) {
......
add_subdirectory(dynload)
add_subdirectory(cpu)
cc_library(pten_context SRCS all_context.cc DEPS device_context)
cc_library(pten_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags)
list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
if (NOT WITH_NV_JETSON)
list(APPEND CUDA_SRCS nvjpeg.cc)
endif()
if (WITH_ROCM)
list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc)
endif()
# There is no macOS version of NCCL.
# Disable the nvrtc and cuda_driver APIs on macOS, and only do an early test on Linux and Windows.
if (NOT APPLE)
list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
if (WITH_NCCL)
list(APPEND CUDA_SRCS nccl.cc)
endif()
if (WITH_ROCM)
list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc)
if (WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc)
endif()
endif()
endif()
if (TENSORRT_FOUND)
list(APPEND CUDA_SRCS tensorrt.cc)
endif()
configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
if(WITH_ROCM)
hip_library(pten_dynload_cuda SRCS ${HIP_SRCS} DEPS pten_dynamic_loader)
cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc)
elseif (WITH_ASCEND_CL)
cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc npu_hccl)
else()
nv_library(pten_dynload_cuda SRCS ${CUDA_SRCS} DEPS pten_dynamic_loader)
cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc)
endif()
if (WITH_MKLML)
cc_library(pten_dynload_mklml SRCS mklml.cc DEPS pten_dynamic_loader mklml)
endif()
cc_library(pten_dynload_lapack SRCS lapack.cc DEPS pten_dynamic_loader)
add_dependencies(pten_dynload_lapack extern_lapack)
# TODO(TJ): add iomp, mkldnn?
if (MKL_FOUND AND WITH_ONEMKL)
message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
cc_library(pten_dynload_mklrt SRCS mklrt.cc DEPS pten_dynamic_loader)
target_include_directories(pten_dynload_mklrt PRIVATE ${MKL_INCLUDE})
endif()
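# Illustrative usage (not part of this file): a downstream target would now
# depend on the pten_* libraries instead of the fluid dynload ones, e.g.
#   cc_library(my_kernel SRCS my_kernel.cc DEPS pten_dynload_cuda)
# where my_kernel is a hypothetical consumer.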
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cublas.h"
namespace pten {
namespace dynload {
std::once_flag cublas_dso_flag;
void *cublas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2
CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3
CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4
CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cublasXt.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cublas_dso_flag;
extern void *cublas_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load cublas routines
* via operator overloading.
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublas_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublas_dso_flag, []() { \
cublas_dso_handle = pten::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublas_dso_handle, #__name); \
return reinterpret_cast<cublas_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
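// Mechanics (descriptive, not new behavior): the first call through a generated
// wrapper, e.g.
//   cublasHandle_t handle;
//   pten::dynload::cublasCreate_v2(&handle);
// dlopens libcublas exactly once (std::call_once), resolves the symbol exactly
// once (function-local static), and every later call is one indirect jump.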
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSaxpy_v2); \
__macro(cublasDaxpy_v2); \
__macro(cublasCaxpy_v2); \
__macro(cublasZaxpy_v2); \
__macro(cublasSscal_v2); \
__macro(cublasDscal_v2); \
__macro(cublasScopy_v2); \
__macro(cublasDcopy_v2); \
__macro(cublasSgemv_v2); \
__macro(cublasDgemv_v2); \
__macro(cublasCgemv_v2); \
__macro(cublasZgemv_v2); \
__macro(cublasSgemm_v2); \
__macro(cublasDgemm_v2); \
__macro(cublasCgemm_v2); \
__macro(cublasZgemm_v2); \
__macro(cublasHgemm); \
__macro(cublasSgemmEx); \
__macro(cublasSgeam); \
__macro(cublasDgeam); \
__macro(cublasStrsm_v2); \
__macro(cublasDtrsm_v2); \
__macro(cublasCtrsm_v2); \
__macro(cublasZtrsm_v2); \
__macro(cublasCreate_v2); \
__macro(cublasDestroy_v2); \
__macro(cublasSetStream_v2); \
__macro(cublasSetPointerMode_v2); \
__macro(cublasGetPointerMode_v2); \
__macro(cublasSgemmBatched); \
__macro(cublasDgemmBatched); \
__macro(cublasCgemmBatched); \
__macro(cublasZgemmBatched); \
__macro(cublasStrsmBatched); \
__macro(cublasDtrsmBatched); \
__macro(cublasCtrsmBatched); \
__macro(cublasZtrsmBatched); \
__macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \
__macro(cublasDgetriBatched); \
__macro(cublasSmatinvBatched); \
__macro(cublasDmatinvBatched); \
__macro(cublasSgetrsBatched); \
__macro(cublasDgetrsBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
__macro(cublasGemmEx); \
__macro(cublasSgemmStridedBatched); \
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000
#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \
__macro(cublasSetMathMode); \
__macro(cublasGetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.1
#if CUDA_VERSION >= 9010
#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx);
CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cublasLt.h"
namespace pten {
namespace dynload {
std::once_flag cublasLt_dso_flag;
void *cublasLt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUBLASLT_BLAS_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cublasLt.h>
#include <cuda.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cublasLt_dso_flag;
extern void *cublasLt_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load cublasLt routines
* via operator overloading.
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = pten::dynload::GetCublasLtDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
// APIs available after CUDA 10.1
// #if CUDA_VERSION >= 10100
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasLtCreate); \
__macro(cublasLtDestroy); \
__macro(cublasLtMatmul); \
__macro(cublasLtMatmulDescCreate); \
__macro(cublasLtMatmulDescDestroy); \
__macro(cublasLtMatmulDescSetAttribute); \
__macro(cublasLtMatrixLayoutCreate); \
__macro(cublasLtMatrixLayoutDestroy); \
__macro(cublasLtMatrixLayoutSetAttribute); \
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cuda_driver.h"
namespace pten {
namespace dynload {
std::once_flag cuda_dso_flag;
void* cuda_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#if CUDA_VERSION >= 10020
CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP);
#endif
CUDA_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() {
std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); });
return cuda_dso_handle != nullptr;
}
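// Guard sketch (illustrative): probe for the driver before the first cuXxx
// call, e.g.
//   if (pten::dynload::HasCUDADriver()) {
//     pten::dynload::cuInit(0);
//   }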
} // namespace dynload
} // namespace pten
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cuda_dso_flag;
extern void* cuda_dso_handle;
extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cuda_func = decltype(&::__name); \
std::call_once(cuda_dso_flag, []() { \
cuda_dso_handle = pten::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(cuda_dso_handle, #__name); \
return reinterpret_cast<cuda_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/**
* include all needed cuda driver functions
**/
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cuInit); \
__macro(cuDriverGetVersion); \
__macro(cuGetErrorString); \
__macro(cuModuleLoadData); \
__macro(cuModuleGetFunction); \
__macro(cuModuleUnload); \
__macro(cuOccupancyMaxActiveBlocksPerMultiprocessor); \
__macro(cuLaunchKernel); \
__macro(cuCtxCreate); \
__macro(cuCtxGetCurrent); \
__macro(cuDeviceGetCount); \
__macro(cuDevicePrimaryCtxGetState); \
__macro(cuDeviceGetAttribute); \
__macro(cuDeviceGet)
#if CUDA_VERSION >= 10020
#define CUDA_ROUTINE_EACH_VVM(__macro) \
__macro(cuMemGetAllocationGranularity); \
__macro(cuMemAddressReserve); \
__macro(cuMemCreate); \
__macro(cuMemMap); \
__macro(cuMemSetAccess); \
__macro(cuMemUnmap); \
__macro(cuMemRelease); \
__macro(cuMemAddressFree)
CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#endif
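// VVM above refers to the CUDA virtual memory management API (cuMemCreate,
// cuMemMap, cuMemSetAccess, ...), which first shipped in CUDA 10.2, hence the
// version guard.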
CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
namespace dynload {
std::once_flag cudnn_dso_flag;
void* cudnn_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP);
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R7
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R8
CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
return cudnn_dso_handle != nullptr;
}
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cudnn_dso_handle,
paddle::platform::errors::PreconditionNotMet(
"Cannot load cudnn shared library. Cannot invoke method %s.",
fn_name));
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cudnn_dso_flag;
extern void* cudnn_dso_handle;
extern bool HasCUDNN();
extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cudnn_func = decltype(&::__name); \
std::call_once(cudnn_dso_flag, []() { \
cudnn_dso_handle = pten::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
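// Note the order inside the generated operator(): the library is loaded under a
// once_flag, EnforceCUDNNLoaded then turns a missing libcudnn into a clear
// PreconditionNotMet error naming the function, and the dlsym result is cached
// in a function-local static.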
/**
* include all needed cudnn functions in HPPL
 * different cudnn versions have different interfaces
**/
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor); \
__macro(cudnnSetTensor4dDescriptorEx); \
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
__macro(cudnnSetFilter4dDescriptor); \
__macro(cudnnSetFilterNdDescriptor); \
__macro(cudnnGetFilterNdDescriptor); \
__macro(cudnnSetPooling2dDescriptor); \
__macro(cudnnSetPoolingNdDescriptor); \
__macro(cudnnGetPoolingNdDescriptor); \
__macro(cudnnDestroyFilterDescriptor); \
__macro(cudnnCreateConvolutionDescriptor); \
__macro(cudnnCreatePoolingDescriptor); \
__macro(cudnnDestroyPoolingDescriptor); \
__macro(cudnnSetConvolution2dDescriptor); \
__macro(cudnnDestroyConvolutionDescriptor); \
__macro(cudnnSetConvolutionNdDescriptor); \
__macro(cudnnGetConvolutionNdDescriptor); \
__macro(cudnnDeriveBNTensorDescriptor); \
__macro(cudnnCreateSpatialTransformerDescriptor); \
__macro(cudnnSetSpatialTransformerNdDescriptor); \
__macro(cudnnDestroySpatialTransformerDescriptor); \
__macro(cudnnSpatialTfGridGeneratorForward); \
__macro(cudnnSpatialTfGridGeneratorBackward); \
__macro(cudnnSpatialTfSamplerForward); \
__macro(cudnnSpatialTfSamplerBackward); \
__macro(cudnnCreate); \
__macro(cudnnDestroy); \
__macro(cudnnSetStream); \
__macro(cudnnActivationForward); \
__macro(cudnnActivationBackward); \
__macro(cudnnConvolutionForward); \
__macro(cudnnConvolutionBackwardBias); \
__macro(cudnnGetConvolutionForwardWorkspaceSize); \
__macro(cudnnTransformTensor); \
__macro(cudnnPoolingForward); \
__macro(cudnnPoolingBackward); \
__macro(cudnnSoftmaxBackward); \
__macro(cudnnSoftmaxForward); \
__macro(cudnnGetVersion); \
__macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithm); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnGetErrorString); \
__macro(cudnnCreateDropoutDescriptor); \
__macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \
__macro(cudnnRestoreDropoutDescriptor); \
__macro(cudnnCreateRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \
__macro(cudnnGetRNNTrainingReserveSize); \
__macro(cudnnRNNForwardTraining); \
__macro(cudnnRNNBackwardData); \
__macro(cudnnRNNBackwardWeights); \
__macro(cudnnRNNForwardInference); \
__macro(cudnnDestroyDropoutDescriptor); \
__macro(cudnnDestroyRNNDescriptor); \
__macro(cudnnSetTensorNdDescriptorEx); \
__macro(cudnnAddTensor); \
__macro(cudnnConvolutionBackwardData); \
__macro(cudnnConvolutionBackwardFilter); \
__macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
__macro(cudnnGetConvolutionBackwardDataWorkspaceSize); \
__macro(cudnnBatchNormalizationForwardTraining); \
__macro(cudnnBatchNormalizationForwardInference); \
__macro(cudnnBatchNormalizationBackward); \
__macro(cudnnCreateActivationDescriptor); \
__macro(cudnnSetActivationDescriptor); \
__macro(cudnnGetActivationDescriptor); \
__macro(cudnnDestroyActivationDescriptor); \
__macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount); \
__macro(cudnnSetConvolutionMathType); \
__macro(cudnnConvolutionBiasActivationForward); \
__macro(cudnnCreateCTCLossDescriptor); \
__macro(cudnnDestroyCTCLossDescriptor); \
__macro(cudnnGetCTCLossDescriptor); \
__macro(cudnnSetCTCLossDescriptor); \
__macro(cudnnGetCTCLossWorkspaceSize); \
__macro(cudnnCTCLoss); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
__macro(cudnnGetConvolutionForwardAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7201
#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
__macro(cudnnCreateRNNDataDescriptor); \
__macro(cudnnDestroyRNNDataDescriptor); \
__macro(cudnnSetRNNDataDescriptor); \
__macro(cudnnSetRNNPaddingMode); \
__macro(cudnnRNNForwardTrainingEx); \
__macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx);
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7401
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
__macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 8000
#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) \
__macro(cudnnSetRNNDescriptor_v8); \
__macro(cudnnCreateFusedOpsPlan); \
__macro(cudnnCreateFusedOpsConstParamPack); \
__macro(cudnnCreateFusedOpsVariantParamPack); \
__macro(cudnnDestroyFusedOpsPlan); \
__macro(cudnnDestroyFusedOpsConstParamPack); \
__macro(cudnnDestroyFusedOpsVariantParamPack); \
__macro(cudnnFusedOpsExecute); \
__macro(cudnnSetFusedOpsConstParamPackAttribute); \
__macro(cudnnSetFusedOpsVariantParamPackAttribute); \
__macro(cudnnMakeFusedOpsPlan);
CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cufft.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
namespace dynload {
std::once_flag cufft_dso_flag;
void* cufft_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUFFT() {
std::call_once(cufft_dso_flag,
[]() { cufft_dso_handle = GetCUFFTDsoHandle(); });
return cufft_dso_handle != nullptr;
}
void EnforceCUFFTLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cufft_dso_handle,
paddle::platform::errors::PreconditionNotMet(
"Cannot load cufft shared library. Cannot invoke method %s.",
fn_name));
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cufft.h>
#include <cufftXt.h>
#include <glog/logging.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cufft_dso_flag;
extern void* cufft_dso_handle;
extern bool HasCUFFT();
extern void EnforceCUFFTLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cufft_func = decltype(&::__name); \
std::call_once(cufft_dso_flag, []() { \
cufft_dso_handle = pten::dynload::GetCUFFTDsoHandle(); \
}); \
EnforceCUFFTLoaded(#__name); \
static void* p_##__name = dlsym(cufft_dso_handle, #__name); \
return reinterpret_cast<cufft_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/**
 * include all needed cufft functions in HPPL;
 * different cufft versions have different interfaces
 **/
#define CUFFT_FFT_ROUTINE_EACH(__macro) \
__macro(cufftPlan1d); \
__macro(cufftPlan2d); \
__macro(cufftPlan3d); \
__macro(cufftPlanMany); \
__macro(cufftMakePlan1d); \
__macro(cufftMakePlan2d); \
__macro(cufftMakePlan3d); \
__macro(cufftMakePlanMany); \
__macro(cufftMakePlanMany64); \
__macro(cufftGetSizeMany64); \
__macro(cufftEstimate1d); \
__macro(cufftEstimate2d); \
__macro(cufftEstimate3d); \
__macro(cufftEstimateMany); \
__macro(cufftCreate); \
__macro(cufftGetSize1d); \
__macro(cufftGetSize2d); \
__macro(cufftGetSize3d); \
__macro(cufftGetSizeMany); \
__macro(cufftGetSize); \
__macro(cufftSetWorkArea); \
__macro(cufftSetAutoAllocation); \
__macro(cufftExecC2C); \
__macro(cufftExecR2C); \
__macro(cufftExecC2R); \
__macro(cufftExecZ2Z); \
__macro(cufftExecD2Z); \
__macro(cufftExecZ2D); \
__macro(cufftSetStream); \
__macro(cufftDestroy); \
__macro(cufftGetVersion); \
__macro(cufftGetProperty); \
__macro(cufftXtSetGPUs); \
__macro(cufftXtMalloc); \
__macro(cufftXtMemcpy); \
__macro(cufftXtFree); \
__macro(cufftXtSetWorkArea); \
__macro(cufftXtExecDescriptorC2C); \
__macro(cufftXtExecDescriptorR2C); \
__macro(cufftXtExecDescriptorC2R); \
__macro(cufftXtExecDescriptorZ2Z); \
__macro(cufftXtExecDescriptorD2Z); \
__macro(cufftXtExecDescriptorZ2D); \
__macro(cufftXtQueryPlan); \
__macro(cufftXtSetCallback); \
__macro(cufftXtClearCallback); \
__macro(cufftXtSetCallbackSharedSize); \
__macro(cufftXtMakePlanMany); \
__macro(cufftXtGetSizeMany); \
__macro(cufftXtExec); \
__macro(cufftXtExecDescriptor); \
__macro(cufftXtSetWorkAreaPolicy);
CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
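// Usage sketch (illustrative only; the helper name below is hypothetical and
// not referenced elsewhere): the wrappers generated above are drop-in
// replacements for the native cuFFT symbols, and the first call lazily
// dlopen()s libcufft, so no link-time dependency on the library is needed.
inline bool CufftCreateDestroyForDocs() {
  cufftHandle plan;
  if (cufftCreate(&plan) != CUFFT_SUCCESS) return false;  // triggers dlopen
  return cufftDestroy(plan) == CUFFT_SUCCESS;
}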
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUPTI
#include "paddle/pten/backends/dynload/cupti.h"
namespace pten {
namespace dynload {
std::once_flag cupti_dso_flag;
void *cupti_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUPTI_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
#endif // PADDLE_WITH_CUPTI
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUPTI
#include <cuda.h>
#include <cupti.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cupti_dso_flag;
extern void *cupti_dso_handle;
/**
 * The following macro definition can generate structs
 * (for each function) that dynamically load the cupti routine
 * via operator overloading.
 *
 * note: default dynamic linked libs
 */
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
using cuptiFunc = decltype(&::__name); \
std::call_once(cupti_dso_flag, []() { \
cupti_dso_handle = pten::dynload::GetCUPTIDsoHandle(); \
}); \
static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define CUPTI_ROUTINE_EACH(__macro) \
__macro(cuptiActivityEnable); \
__macro(cuptiActivityDisable); \
__macro(cuptiActivityRegisterCallbacks); \
__macro(cuptiActivityGetAttribute); \
__macro(cuptiActivitySetAttribute); \
__macro(cuptiGetTimestamp); \
__macro(cuptiActivityGetNextRecord); \
__macro(cuptiGetResultString); \
__macro(cuptiActivityGetNumDroppedRecords); \
__macro(cuptiActivityFlushAll); \
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback); \
__macro(cuptiEnableDomain);
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
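// Usage sketch (illustrative only; hypothetical helper): after the
// declarations above, pten::dynload::cuptiGetTimestamp behaves like the
// native CUPTI entry point, loading libcupti on first use.
inline bool CuptiTimestampForDocs(uint64_t* ts) {
  return cuptiGetTimestamp(ts) == CUPTI_SUCCESS;
}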
} // namespace dynload
} // namespace pten
#endif // PADDLE_WITH_CUPTI
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@"
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/curand.h"
namespace pten {
namespace dynload {
std::once_flag curand_dso_flag;
void *curand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <curand.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag curand_dso_flag;
extern void *curand_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
curandStatus_t operator()(Args... args) { \
using curandFunc = decltype(&::__name); \
std::call_once(curand_dso_flag, []() { \
curand_dso_handle = pten::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(curand_dso_handle, #__name); \
return reinterpret_cast<curandFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define CURAND_RAND_ROUTINE_EACH(__macro) \
__macro(curandCreateGenerator); \
__macro(curandSetStream); \
__macro(curandSetPseudoRandomGeneratorSeed); \
__macro(curandGenerateUniform); \
__macro(curandGenerateUniformDouble); \
__macro(curandGenerateNormal); \
__macro(curandDestroyGenerator);
CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cusolver.h"
namespace pten {
namespace dynload {
std::once_flag cusolver_dso_flag;
void *cusolver_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUSOLVER_ROUTINE_EACH(DEFINE_WRAP);
#ifdef CUSOLVER_ROUTINE_EACH_R1
CUSOLVER_ROUTINE_EACH_R1(DEFINE_WRAP);
#endif
#ifdef CUSOLVER_ROUTINE_EACH_R2
CUSOLVER_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
#include <cusolverDn.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cusolver_dso_flag;
extern void *cusolver_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cusolverStatus_t operator()(Args... args) { \
using cusolverFunc = decltype(&::__name); \
std::call_once(cusolver_dso_flag, []() { \
cusolver_dso_handle = pten::dynload::GetCusolverDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \
return reinterpret_cast<cusolverFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define CUSOLVER_ROUTINE_EACH(__macro) \
__macro(cusolverDnCreate); \
__macro(cusolverDnDestroy); \
__macro(cusolverDnSetStream); \
__macro(cusolverDnSpotrf_bufferSize); \
__macro(cusolverDnDpotrf_bufferSize); \
__macro(cusolverDnSpotrf); \
__macro(cusolverDnDpotrf); \
__macro(cusolverDnSpotrs); \
__macro(cusolverDnDpotrs); \
__macro(cusolverDnCpotrs); \
__macro(cusolverDnZpotrs); \
__macro(cusolverDnSsyevd_bufferSize); \
__macro(cusolverDnDsyevd_bufferSize); \
__macro(cusolverDnCheevd_bufferSize); \
__macro(cusolverDnZheevd_bufferSize); \
__macro(cusolverDnSsyevd); \
__macro(cusolverDnDsyevd); \
__macro(cusolverDnCheevd); \
__macro(cusolverDnZheevd);
CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
#if CUDA_VERSION >= 9020
#define CUSOLVER_ROUTINE_EACH_R1(__macro) \
__macro(cusolverDnSpotrfBatched); \
__macro(cusolverDnDpotrfBatched); \
__macro(cusolverDnSpotrsBatched); \
__macro(cusolverDnDpotrsBatched); \
__macro(cusolverDnSgesvdj_bufferSize); \
__macro(cusolverDnSgetrf_bufferSize); \
__macro(cusolverDnDgetrf_bufferSize); \
__macro(cusolverDnCgetrf_bufferSize); \
__macro(cusolverDnZgetrf_bufferSize); \
__macro(cusolverDnSgeqrf_bufferSize); \
__macro(cusolverDnDgeqrf_bufferSize); \
__macro(cusolverDnCgeqrf_bufferSize); \
__macro(cusolverDnZgeqrf_bufferSize); \
__macro(cusolverDnSorgqr_bufferSize); \
__macro(cusolverDnDorgqr_bufferSize); \
__macro(cusolverDnSormqr_bufferSize); \
__macro(cusolverDnDormqr_bufferSize); \
__macro(cusolverDnCungqr_bufferSize); \
__macro(cusolverDnZungqr_bufferSize); \
__macro(cusolverDnDestroyGesvdjInfo); \
__macro(cusolverDnCreateGesvdjInfo); \
__macro(cusolverDnDgesvdj_bufferSize); \
__macro(cusolverDnSgesvdj); \
__macro(cusolverDnDgesvdj); \
__macro(cusolverDnSgetrf); \
__macro(cusolverDnDgetrf); \
__macro(cusolverDnCgetrf); \
__macro(cusolverDnZgetrf); \
__macro(cusolverDnSgeqrf); \
__macro(cusolverDnDgeqrf); \
__macro(cusolverDnCgeqrf); \
__macro(cusolverDnZgeqrf); \
__macro(cusolverDnSorgqr); \
__macro(cusolverDnDorgqr); \
__macro(cusolverDnSormqr); \
__macro(cusolverDnDormqr); \
__macro(cusolverDnCungqr); \
__macro(cusolverDnZungqr);
CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#if CUDA_VERSION >= 9020
#define CUSOLVER_ROUTINE_EACH_R2(__macro) \
__macro(cusolverDnCreateSyevjInfo); \
__macro(cusolverDnSsyevj_bufferSize); \
__macro(cusolverDnDsyevj_bufferSize); \
__macro(cusolverDnSsyevj); \
__macro(cusolverDnDsyevj); \
__macro(cusolverDnDestroySyevjInfo);
CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cusparse.h"
namespace pten {
namespace dynload {
std::once_flag cusparse_dso_flag;
void *cusparse_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#ifdef CUSPARSE_ROUTINE_EACH
CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif
#ifdef CUSPARSE_ROUTINE_EACH_R2
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
#ifdef CUSPARSE_ROUTINE_EACH_11020
CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
#include <cusparse.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cusparse_dso_flag;
extern void *cusparse_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cusparseStatus_t operator()(Args... args) { \
using cusparseFunc = decltype(&::__name); \
std::call_once(cusparse_dso_flag, []() { \
cusparse_dso_handle = pten::dynload::GetCusparseDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
return reinterpret_cast<cusparseFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported since CUDA 10.1
#if CUDA_VERSION >= 10010
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
__macro(cusparseSpMM_bufferSize); \
__macro(cusparseSpMM); \
__macro(cusparseDestroySpMat); \
__macro(cusparseDestroyDnMat); \
__macro(cusparseCooSetPointers); \
__macro(cusparseCsrSetPointers); \
__macro(cusparseDenseToSparse_bufferSize); \
__macro(cusparseDenseToSparse_analysis); \
__macro(cusparseDenseToSparse_convert); \
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
__macro(cusparseSDDMM_preprocess); \
__macro(cusparseSDDMM);
CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include <cstdlib>
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/cupti_lib_path.h"
#if defined(_WIN32)
#include <windows.h>
#endif
// TODO(wilber): The pten computing library requires a component to manage flags
// (maybe not use gflags).
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(cudnn_dir,
"",
"Specify path for loading libcudnn.so. For instance, "
"/usr/local/cudnn/lib. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH");
DEFINE_string(
cuda_dir,
"",
"Specify path for loading cuda library, such as libcublas, libcublasLt "
"libcurand, libcusolver. For instance, /usr/local/cuda/lib64. "
"If default, dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(nccl_dir,
"",
"Specify path for loading nccl library, such as libnccl.so. "
"For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(hccl_dir,
"",
"Specify path for loading hccl library, such as libhccl.so. "
"For instance, "
"/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. If "
"default, "
"dlopen will search hccl from LD_LIBRARY_PATH");
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
DEFINE_string(
tensorrt_dir,
"",
"Specify path for loading tensorrt library, such as libnvinfer.so.");
DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
DEFINE_string(mkl_dir,
"",
"Specify path for loading libmkl_rt.so. "
"For insrance, /opt/intel/oneapi/mkl/latest/lib/intel64/."
"If default, "
"dlopen will search mkl from LD_LIBRARY_PATH");
DEFINE_string(op_dir, "", "Specify path for loading user-defined op library.");
#ifdef PADDLE_WITH_HIP
DEFINE_string(miopen_dir,
"",
"Specify path for loading libMIOpen.so. For instance, "
"/opt/rocm/miopen/lib. If empty [default], dlopen "
"will search miopen from LD_LIBRARY_PATH");
DEFINE_string(rocm_dir,
"",
"Specify path for loading rocm library, such as librocblas, "
"libmiopen, libhipsparse. For instance, /opt/rocm/lib. "
"If default, dlopen will search rocm from LD_LIBRARY_PATH");
DEFINE_string(rccl_dir,
"",
"Specify path for loading rccl library, such as librccl.so. "
"For instance, /opt/rocm/rccl/lib. If default, "
"dlopen will search rccl from LD_LIBRARY_PATH");
#endif
namespace pten {
namespace dynload {
struct PathNode {
PathNode() {}
std::string path = "";
};
static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
// NOTE: In order to adapt to the default installation path of cuda
#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin";
#else
static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64";
#endif
static PathNode s_py_site_pkg_path;
#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll";
static constexpr char* win_cublas_lib =
"cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cublas64_" CUDA_VERSION_MAJOR ".dll";
#if CUDA_VERSION >= 11000
static constexpr char* win_curand_lib =
"curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll";
static constexpr char* win_nvjpeg_lib =
"nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll";
static constexpr char* win_cusolver_lib =
"cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll";
static constexpr char* win_cusparse_lib =
"cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll";
static constexpr char* win_cufft_lib =
"cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_10.dll";
#else
static constexpr char* win_curand_lib =
"curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;curand64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_nvjpeg_lib =
"nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cusolver_lib =
"cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cusparse_lib =
"cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cufft_lib =
"cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cufft64_" CUDA_VERSION_MAJOR ".dll";
#endif // CUDA_VERSION
#endif
static inline std::string join(const std::string& part1,
const std::string& part2) {
// directory separator
const char sep = '/';
if (!part2.empty() && part2.front() == sep) {
return part2;
}
std::string ret;
ret.reserve(part1.size() + part2.size() + 1);
ret = part1;
if (!ret.empty() && ret.back() != sep) {
ret += sep;
}
ret += part2;
return ret;
}
static inline std::vector<std::string> split(
const std::string& str, const std::string separator = " ") {
std::vector<std::string> str_list;
std::string::size_type firstPos;
firstPos = str.find_first_not_of(separator, 0);
std::string::size_type lastPos;
lastPos = str.find_first_of(separator, firstPos);
while (std::string::npos != firstPos && std::string::npos != lastPos) {
str_list.push_back(str.substr(firstPos, lastPos - firstPos));
firstPos = str.find_first_not_of(separator, lastPos);
lastPos = str.find_first_of(separator, firstPos);
}
  if (std::string::npos != firstPos && std::string::npos == lastPos) {
    // guard firstPos: substr(npos, ...) would throw for an empty input or an
    // input consisting only of separators
    str_list.push_back(str.substr(firstPos, lastPos - firstPos));
  }
return str_list;
}
void SetPaddleLibPath(const std::string& py_site_pkg_path) {
s_py_site_pkg_path.path = py_site_pkg_path;
VLOG(3) << "Set paddle lib path : " << py_site_pkg_path;
}
static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
const std::string& dso_name,
int dynload_flags) {
void* dso_handle = nullptr;
if (!spec_path.empty()) {
// search xxx.so from custom path
VLOG(3) << "Try to find library: " << dso_name
<< " from specific path: " << spec_path;
std::string dso_path = join(spec_path, dso_name);
dso_handle = dlopen(dso_path.c_str(), dynload_flags);
}
return dso_handle;
}
static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
int dynload_flags) {
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
// and /usr/local/lib path
void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
VLOG(3) << "Try to find library: " << dso_path
<< " from default system path.";
// TODO(chenweihang): This path is used to search which libs?
// DYLD_LIBRARY_PATH is disabled since Mac OS 10.11 by System
// Integrity Protection (SIP), so if dso_handle is null, search
// the default package path on Mac OS.
#if defined(__APPLE__) || defined(__OSX__)
if (nullptr == dso_handle) {
dso_handle =
dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
}
#endif
return dso_handle;
}
/*
 * We define three priorities for dynamic library search:
 *
 * First: Search the path specified by the user
 * Second: Search the system default path
 * Third: Search a special path corresponding to a specific library,
 *        to adapt to changes and remain easy to extend.
 */
static inline void* GetDsoHandleFromSearchPath(
const std::string& config_path,
const std::string& dso_name,
bool throw_on_error = true,
const std::vector<std::string>& extra_paths = std::vector<std::string>(),
const std::string& warning_msg = std::string()) {
#if !defined(_WIN32)
int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
#else
int dynload_flags = 0;
#endif // !_WIN32
std::vector<std::string> dso_names = split(dso_name, ";");
void* dso_handle = nullptr;
for (auto dso : dso_names) {
// 1. search in user config path by FLAGS
dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags);
// 2. search in system default path
if (nullptr == dso_handle) {
dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags);
}
// 3. search in extra paths
if (nullptr == dso_handle) {
for (auto path : extra_paths) {
VLOG(3) << "extra_paths: " << path;
dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags);
}
}
if (nullptr != dso_handle) break;
}
// 4. [If Failed for All dso_names] logging warning if exists
if (nullptr == dso_handle && !warning_msg.empty()) {
LOG(WARNING) << warning_msg;
}
// 5. [If Failed for All dso_names] logging or throw error info
if (nullptr == dso_handle) {
auto error_msg =
"The third-party dynamic library (%s) that Paddle depends on is not "
"configured correctly. (error code is %s)\n"
" Suggestions:\n"
" 1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) "
"is installed correctly and its version is matched with paddlepaddle "
"you installed.\n"
" 2. Configure third-party dynamic library environment variables as "
"follows:\n"
" - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
" - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
" - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` "
"[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
"impossible unless System Integrity Protection (SIP) is disabled.]";
#if !defined(_WIN32)
auto errorno = dlerror();
#else
auto errorno = GetLastError();
#endif // !_WIN32
if (throw_on_error) {
// NOTE: Special error report case, no need to change its format
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
error_msg, dso_name, errorno));
} else {
LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno);
}
}
return dso_handle;
}
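// Usage sketch (illustrative only; "libfoo" is a placeholder, mirroring the
// Get*DsoHandle wrappers below): resolve one of several candidate names,
// preferring the user-configured directory, then the system default path,
// then an extra fallback directory:
//
//   void* handle = GetDsoHandleFromSearchPath(
//       FLAGS_cuda_dir, "libfoo.so.11;libfoo.so", /*throw_on_error=*/false,
//       {cuda_lib_path});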
void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
#endif
}
void* GetCublasLtDsoHandle() {
// APIs available after CUDA 10.1
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
#else
  std::string warning_msg(
      "Your CUDA version is earlier than 10.1 and does not support CublasLt. "
      "If you want to use CublasLt, please upgrade CUDA and rebuild "
      "PaddlePaddle.");
  LOG(WARNING) << warning_msg;
  return nullptr;
#endif
}
void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
  std::string mac_warn_msg(
      "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
      "For instance, sudo tar -xzf "
      "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
      "chmod a+r /usr/local/cuda/include/cudnn.h "
      "/usr/local/cuda/lib/libcudnn*");
  return GetDsoHandleFromSearchPath(
      FLAGS_cudnn_dir, "libcudnn.dylib", false, {}, mac_warn_msg);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
  std::string win_warn_msg(
      "Note: [Recommend] copy cudnn into CUDA installation directory. \n "
      "For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from "
      "NVIDIA's official website, \n"
      "then unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing "
      "Toolkit\\CUDA\\v10.0\n"
      "You should do this according to your CUDA installation directory and "
      "CUDNN version.");
  return GetDsoHandleFromSearchPath(
      FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_msg);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
#else
return GetDsoHandleFromSearchPath(
FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path});
#endif
}
void* GetCUPTIDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(
FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path});
#else
return GetDsoHandleFromSearchPath(
FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
#endif
}
void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path});
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
#endif
}
#ifdef PADDLE_WITH_HIP
void* GetROCFFTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so");
#endif
}
#endif
void* GetNvjpegDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_nvjpeg_lib, true, {cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so");
#endif
}
void* GetCusolverDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
#endif
}
void* GetCusparseDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so");
#endif
}
void* GetNVRTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
#endif
}
void* GetCUDADsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
#elif defined(_WIN32)
char system32_dir[MAX_PATH];
GetSystemDirectory(system32_dir, MAX_PATH);
return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
#endif
}
void* GetWarpCTCDsoHandle() {
std::string warpctc_dir = "";
if (!s_py_site_pkg_path.path.empty()) {
warpctc_dir = s_py_site_pkg_path.path;
}
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll");
#else
return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so");
#endif
}
void* GetNCCLDsoHandle() {
#ifdef PADDLE_WITH_HIP
  std::string warning_msg(
      "You may need to install 'rccl' from the ROCM official website: "
      "https://rocmdocs.amd.com/en/latest/Installation_Guide/"
      "Installation-Guide.html before installing PaddlePaddle.");
#else
  std::string warning_msg(
      "You may need to install 'nccl2' from the NVIDIA official website: "
      "https://developer.nvidia.com/nccl/nccl-download "
      "before installing PaddlePaddle.");
#endif
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(
FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(
FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg);
#else
return GetDsoHandleFromSearchPath(
FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg);
#endif
}
void* GetHCCLDsoHandle() {
  std::string warning_msg(
      "You may need to install 'hccl2' from the Huawei official website "
      "before installing PaddlePaddle.");
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(
FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
#elif defined(PADDLE_WITH_ASCEND_CL)
return GetDsoHandleFromSearchPath(
FLAGS_hccl_dir, "libhccl.so", true, {}, warning_msg);
#else
return GetDsoHandleFromSearchPath(
FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg);
#endif
}
void* GetTensorRtDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
#endif
}
void* GetMKLMLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
#endif
}
void* GetLAPACKDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3");
#endif
}
void* GetOpDsoHandle(const std::string& dso_name) {
return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
}
void* GetNvtxDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
  PADDLE_THROW(paddle::platform::errors::Unimplemented(
      "Nvtx is not supported on Apple."));
#elif defined(_WIN32)
  PADDLE_THROW(paddle::platform::errors::Unimplemented(
      "Nvtx is not supported on Windows."));
#elif !defined(PADDLE_WITH_CUDA)
  PADDLE_THROW(paddle::platform::errors::Unimplemented(
      "Nvtx is not supported without CUDA."));
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
#endif
}
void* GetCUFFTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so");
#endif
}
void* GetMKLRTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so");
#endif
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
namespace pten {
namespace dynload {
#ifndef _WIN32
#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
#else
#define DECLARE_TYPE(__name, ...) decltype(auto)
#endif
void* GetCublasDsoHandle();
void* GetCublasLtDsoHandle();
void* GetCUDNNDsoHandle();
void* GetCUPTIDsoHandle();
void* GetCurandDsoHandle();
void* GetNvjpegDsoHandle();
void* GetCusolverDsoHandle();
void* GetCusparseDsoHandle();
void* GetNVRTCDsoHandle();
void* GetCUDADsoHandle();
void* GetWarpCTCDsoHandle();
void* GetNCCLDsoHandle();
void* GetHCCLDsoHandle();
void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle();
void* GetLAPACKDsoHandle();
void* GetOpDsoHandle(const std::string& dso_name);
void* GetNvtxDsoHandle();
void* GetCUFFTDsoHandle();
void* GetMKLRTDsoHandle();
void* GetROCFFTDsoHandle();
void SetPaddleLibPath(const std::string&);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/hipfft.h"
namespace pten {
namespace dynload {
std::once_flag hipfft_dso_flag;
void *hipfft_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HIP
#include <hipfft.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag hipfft_dso_flag;
extern void *hipfft_dso_handle;
#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hipfftFunc = decltype(&::__name); \
std::call_once(hipfft_dso_flag, []() { \
hipfft_dso_handle = pten::dynload::GetROCFFTDsoHandle(); \
}); \
static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \
return reinterpret_cast<hipfftFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define HIPFFT_FFT_ROUTINE_EACH(__macro) \
__macro(hipfftPlan1d); \
__macro(hipfftPlan2d); \
__macro(hipfftPlan3d); \
__macro(hipfftPlanMany); \
__macro(hipfftMakePlan1d); \
__macro(hipfftMakePlanMany); \
__macro(hipfftMakePlanMany64); \
__macro(hipfftGetSizeMany64); \
__macro(hipfftEstimate1d); \
__macro(hipfftEstimate2d); \
__macro(hipfftEstimate3d); \
__macro(hipfftEstimateMany); \
__macro(hipfftCreate); \
__macro(hipfftGetSize1d); \
__macro(hipfftGetSizeMany); \
__macro(hipfftGetSize); \
__macro(hipfftSetWorkArea); \
__macro(hipfftSetAutoAllocation); \
__macro(hipfftExecC2C); \
__macro(hipfftExecR2C); \
__macro(hipfftExecC2R); \
__macro(hipfftExecZ2Z); \
__macro(hipfftExecD2Z); \
__macro(hipfftExecZ2D); \
__macro(hipfftSetStream); \
__macro(hipfftDestroy); \
__macro(hipfftGetVersion); \
__macro(hipfftGetProperty);
HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
inline const char *hipfftGetErrorString(hipfftResult_t status) {
switch (status) {
case HIPFFT_SUCCESS:
return "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
case HIPFFT_INVALID_PLAN:
return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
case HIPFFT_ALLOC_FAILED:
return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
"memory.";
case HIPFFT_INVALID_TYPE:
return "'HIPFFT_INVALID_TYPE'. No longer used.";
case HIPFFT_INVALID_VALUE:
return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
"parameter.";
case HIPFFT_INTERNAL_ERROR:
return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
"error.";
case HIPFFT_EXEC_FAILED:
return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
case HIPFFT_SETUP_FAILED:
return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
case HIPFFT_INVALID_SIZE:
return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
case HIPFFT_UNALIGNED_DATA:
return "'HIPFFT_UNALIGNED_DATA'. No longer used.";
case HIPFFT_INCOMPLETE_PARAMETER_LIST:
return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
case HIPFFT_INVALID_DEVICE:
return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
"GPU than plan creation.";
case HIPFFT_PARSE_ERROR:
return "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
case HIPFFT_NO_WORKSPACE:
return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
"plan execution.";
case HIPFFT_NOT_IMPLEMENTED:
return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
"functionality for parameters given.";
case HIPFFT_NOT_SUPPORTED:
return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
"parameters given.";
default:
return "HIPFFT_STATUS_UNKNOWN_ERROR";
}
}
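// Usage sketch (illustrative only; hypothetical helper): pair the dynloaded
// entry points with hipfftGetErrorString above to surface readable failure
// reasons; a null return means the call succeeded.
inline const char *HipfftErrorOrNullForDocs(hipfftResult_t ret) {
  return ret == HIPFFT_SUCCESS ? nullptr : hipfftGetErrorString(ret);
}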
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/hiprand.h"
namespace pten {
namespace dynload {
std::once_flag hiprand_dso_flag;
void *hiprand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hiprand.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/port.h"
#include "paddle/pten/backends/dynload/dynamic_loader.h"
namespace pten {
namespace dynload {
extern std::once_flag hiprand_dso_flag;
extern void *hiprand_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
hiprandStatus_t operator()(Args... args) { \
using hiprandFunc = decltype(&::__name); \
std::call_once(hiprand_dso_flag, []() { \
hiprand_dso_handle = pten::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \
return reinterpret_cast<hiprandFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define HIPRAND_RAND_ROUTINE_EACH(__macro) \
__macro(hiprandCreateGenerator); \
__macro(hiprandSetStream); \
__macro(hiprandSetPseudoRandomGeneratorSeed); \
__macro(hiprandGenerateUniform); \
__macro(hiprandGenerateUniformDouble); \
__macro(hiprandGenerateNormal); \
__macro(hiprandDestroyGenerator);
HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/hiprtc.h"
namespace pten {
namespace dynload {
std::once_flag hiprtc_dso_flag;
void* hiprtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() {
std::call_once(hiprtc_dso_flag,
[]() { hiprtc_dso_handle = GetNVRTCDsoHandle(); });
return hiprtc_dso_handle != nullptr;
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hip/hiprtc.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag hiprtc_dso_flag;
extern void* hiprtc_dso_handle;
extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hiprtc_func = decltype(&::__name); \
std::call_once(hiprtc_dso_flag, []() { \
hiprtc_dso_handle = pten::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \
return reinterpret_cast<hiprtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/**
* include all needed hiprtc functions
**/
#define HIPRTC_ROUTINE_EACH(__macro) \
__macro(hiprtcVersion); \
__macro(hiprtcGetErrorString); \
__macro(hiprtcCompileProgram); \
__macro(hiprtcCreateProgram); \
__macro(hiprtcDestroyProgram); \
__macro(hiprtcGetCode); \
__macro(hiprtcGetCodeSize); \
__macro(hiprtcGetProgramLog); \
__macro(hiprtcGetProgramLogSize)
HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
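// Usage sketch (illustrative only; hypothetical helper): query the hiprtc
// version through the dynloaded wrapper; the first call dlopen()s the
// library resolved by GetNVRTCDsoHandle().
inline bool HiprtcVersionForDocs(int* major, int* minor) {
  return hiprtcVersion(major, minor) == HIPRTC_SUCCESS;
}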
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/lapack.h"
#include <mutex>
namespace pten {
namespace dynload {
std::once_flag lapack_dso_flag;
void* lapack_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
LAPACK_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <complex>
#include <mutex>
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
// Note(zhouwei): LAPACK does not ship an appropriate header file, so the
// needed API declarations have to be spelled out manually.
// getrf_ (for example):
extern "C" void dgetrf_(
int *m, int *n, double *a, int *lda, int *ipiv, int *info);
extern "C" void sgetrf_(
int *m, int *n, float *a, int *lda, int *ipiv, int *info);
// evd
extern "C" void zheevd_(char *jobz,
char *uplo,
int *n,
std::complex<double> *a,
int *lda,
double *w,
std::complex<double> *work,
int *lwork,
double *rwork,
int *lrwork,
int *iwork,
int *liwork,
int *info);
extern "C" void cheevd_(char *jobz,
char *uplo,
int *n,
std::complex<float> *a,
int *lda,
float *w,
std::complex<float> *work,
int *lwork,
float *rwork,
int *lrwork,
int *iwork,
int *liwork,
int *info);
extern "C" void dsyevd_(char *jobz,
char *uplo,
int *n,
double *a,
int *lda,
double *w,
double *work,
int *lwork,
int *iwork,
int *liwork,
int *info);
extern "C" void ssyevd_(char *jobz,
char *uplo,
int *n,
float *a,
int *lda,
float *w,
float *work,
int *lwork,
int *iwork,
int *liwork,
int *info);
// geev
extern "C" void dgeev_(char *jobvl,
char *jobvr,
int *n,
double *a,
int *lda,
double *wr,
double *wi,
double *vl,
int *ldvl,
double *vr,
int *ldvr,
double *work,
int *lwork,
int *info);
extern "C" void sgeev_(char *jobvl,
char *jobvr,
int *n,
float *a,
int *lda,
float *wr,
float *wi,
float *vl,
int *ldvl,
float *vr,
int *ldvr,
float *work,
int *lwork,
int *info);
extern "C" void zgeev_(char *jobvl,
char *jobvr,
int *n,
std::complex<double> *a,
int *lda,
std::complex<double> *w,
std::complex<double> *vl,
int *ldvl,
std::complex<double> *vr,
int *ldvr,
std::complex<double> *work,
int *lwork,
double *rwork,
int *info);
extern "C" void cgeev_(char *jobvl,
char *jobvr,
int *n,
std::complex<float> *a,
int *lda,
std::complex<float> *w,
std::complex<float> *vl,
int *ldvl,
std::complex<float> *vr,
int *ldvr,
std::complex<float> *work,
int *lwork,
float *rwork,
int *info);
// gels
extern "C" void dgels_(char *trans,
int *m,
int *n,
int *nrhs,
double *a,
int *lda,
double *b,
int *ldb,
double *work,
int *lwork,
int *info);
extern "C" void sgels_(char *trans,
int *m,
int *n,
int *nrhs,
float *a,
int *lda,
float *b,
int *ldb,
float *work,
int *lwork,
int *info);
// gelsd
extern "C" void dgelsd_(int *m,
int *n,
int *nrhs,
double *a,
int *lda,
double *b,
int *ldb,
double *s,
double *rcond,
int *rank,
double *work,
int *lwork,
int *iwork,
int *info);
extern "C" void sgelsd_(int *m,
int *n,
int *nrhs,
float *a,
int *lda,
float *b,
int *ldb,
float *s,
float *rcond,
int *rank,
float *work,
int *lwork,
int *iwork,
int *info);
// gelsy
extern "C" void dgelsy_(int *m,
int *n,
int *nrhs,
double *a,
int *lda,
double *b,
int *ldb,
int *jpvt,
double *rcond,
int *rank,
double *work,
int *lwork,
int *info);
extern "C" void sgelsy_(int *m,
int *n,
int *nrhs,
float *a,
int *lda,
float *b,
int *ldb,
int *jpvt,
float *rcond,
int *rank,
float *work,
int *lwork,
int *info);
// gelss
extern "C" void dgelss_(int *m,
int *n,
int *nrhs,
double *a,
int *lda,
double *b,
int *ldb,
double *s,
double *rcond,
int *rank,
double *work,
int *lwork,
int *info);
extern "C" void sgelss_(int *m,
int *n,
int *nrhs,
float *a,
int *lda,
float *b,
int *ldb,
float *s,
float *rcond,
int *rank,
float *work,
int *lwork,
int *info);
extern "C" void zpotrs_(char *uplo,
int *n,
int *nrhs,
std::complex<double> *a,
int *lda,
std::complex<double> *b,
int *ldb,
int *info);
extern "C" void cpotrs_(char *uplo,
int *n,
int *nrhs,
std::complex<float> *a,
int *lda,
std::complex<float> *b,
int *ldb,
int *info);
extern "C" void dpotrs_(char *uplo,
int *n,
int *nrhs,
double *a,
int *lda,
double *b,
int *ldb,
int *info);
extern "C" void spotrs_(char *uplo,
int *n,
int *nrhs,
float *a,
int *lda,
float *b,
int *ldb,
int *info);
namespace pten {
namespace dynload {
extern std::once_flag lapack_dso_flag;
extern void *lapack_dso_handle;
/**
 * The following macro definition generates a struct
 * (for each function) that dynamically loads the lapack routine
 * via operator overloading.
 */
#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using lapackFunc = decltype(&::__name); \
std::call_once(lapack_dso_flag, []() { \
lapack_dso_handle = pten::dynload::GetLAPACKDsoHandle(); \
}); \
    static void *p_##__name = dlsym(lapack_dso_handle, #__name);     \
    return reinterpret_cast<lapackFunc>(p_##__name)(args...);        \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \
DYNAMIC_LOAD_LAPACK_WRAP(__name)
#define LAPACK_ROUTINE_EACH(__macro) \
__macro(dgetrf_); \
__macro(sgetrf_); \
__macro(zheevd_); \
__macro(cheevd_); \
__macro(dsyevd_); \
__macro(ssyevd_); \
__macro(dgeev_); \
__macro(sgeev_); \
__macro(zgeev_); \
__macro(cgeev_); \
__macro(dgels_); \
__macro(sgels_); \
__macro(dgelsd_); \
__macro(sgelsd_); \
__macro(dgelsy_); \
__macro(sgelsy_); \
__macro(dgelss_); \
__macro(sgelss_); \
__macro(zpotrs_); \
__macro(cpotrs_); \
__macro(dpotrs_); \
__macro(spotrs_);
LAPACK_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_LAPACK_WRAP);
#undef DYNAMIC_LOAD_LAPACK_WRAP
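// Usage sketch (illustrative only, assuming the LAPACK library can be found
// at runtime): the wrappers are called exactly like the native symbols, e.g.
// an LU factorization of a column-major 2x2 matrix:
//
//   int m = 2, n = 2, lda = 2, info = 0;
//   int ipiv[2];
//   double a[4] = {4.0, 3.0, 6.0, 3.0};
//   pten::dynload::dgetrf_(&m, &n, a, &lda, ipiv, &info);
//   // info == 0 indicates a successful factorization.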
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
namespace dynload {
std::once_flag miopen_dso_flag;
void* miopen_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MIOPEN_DNN_ROUTINE_EACH(DEFINE_WRAP);
MIOPEN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R3
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R4
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_R5
MIOPEN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_R6
MIOPEN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_R7
MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R7
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(miopen_dso_flag,
[]() { miopen_dso_handle = GetCUDNNDsoHandle(); });
return miopen_dso_handle != nullptr;
}
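// Illustrative call pattern (hypothetical caller code, not part of this
// file): guard miopen wrapper calls behind the availability check, e.g.
//   if (pten::dynload::HasCUDNN()) { /* safe to use the miopen wrappers */ }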
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
miopen_dso_handle,
paddle::platform::errors::PreconditionNotMet(
"Cannot load miopen shared library. Cannot invoke method %s.",
fn_name));
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <miopen/miopen.h>
#include <miopen/version.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
#define MIOPEN_VERSION \
(MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \
MIOPEN_VERSION_PATCH) // NOLINT
// MIOPEN only supports NCHW; this typedef exists for compatibility with the
// cuDNN API.
typedef enum {
MIOPEN_TENSOR_NCHW = 0,
MIOPEN_TENSOR_NHWC = 1,
} miopenTensorFormat_t;
namespace pten {
namespace dynload {
extern std::once_flag miopen_dso_flag;
extern void* miopen_dso_handle;
extern bool HasCUDNN();
inline const char* miopenGetErrorString(miopenStatus_t status) {
switch (status) {
case miopenStatusSuccess:
return "MIOPEN_STATUS_SUCCESS";
case miopenStatusNotInitialized:
return "MIOPEN_STATUS_NOT_INITIALIZED";
case miopenStatusInvalidValue:
return "MIOPEN_STATUS_INVALID_VALUE";
case miopenStatusBadParm:
return "MIOPEN_STATUS_BAD_PARAM";
case miopenStatusAllocFailed:
return "MIOPEN_STATUS_ALLOC_FAILED";
case miopenStatusInternalError:
return "MIOPEN_STATUS_INTERNAL_ERROR";
case miopenStatusNotImplemented:
return "MIOPEN_STATUS_NOT_IMPLEMENTED";
case miopenStatusUnsupportedOp:
return "MIOPEN_STATUS_UNSUPPORTED_OP";
case miopenStatusUnknownError:
default:
return "MIOPEN_STATUS_UNKNOWN_ERROR";
}
}
extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using miopen_func = decltype(&::__name); \
std::call_once(miopen_dso_flag, []() { \
miopen_dso_handle = pten::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(miopen_dso_handle, #__name); \
return reinterpret_cast<miopen_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
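// For example, DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(miopenCreate) defines a
// struct DynLoad__miopenCreate whose operator() lazily opens the miopen
// library, resolves "miopenCreate" via dlsym, and forwards the call; it also
// declares a global instance named miopenCreate.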
/**
* include all needed miopen functions in HPPL
**/
#define MIOPEN_DNN_ROUTINE_EACH(__macro) \
__macro(miopenGetVersion); \
__macro(miopenOpTensor); \
__macro(miopenSet4dTensorDescriptor); \
__macro(miopenSetTensorDescriptor); \
__macro(miopenInitConvolutionNdDescriptor); \
__macro(miopenFindConvolutionForwardAlgorithm); \
__macro(miopenGetConvolutionNdForwardOutputDim); \
__macro(miopenFindConvolutionBackwardDataAlgorithm); \
__macro(miopenFindConvolutionBackwardWeightsAlgorithm); \
__macro(miopenGetTensorDescriptor); \
__macro(miopenCreateTensorDescriptor); \
__macro(miopenDestroyTensorDescriptor); \
__macro(miopenGetTensorDescriptorSize); \
__macro(miopenSet2dPoolingDescriptor); \
__macro(miopenGet2dPoolingDescriptor); \
__macro(miopenGetPoolingNdForwardOutputDim); \
__macro(miopenCreateConvolutionDescriptor); \
__macro(miopenCreatePoolingDescriptor); \
__macro(miopenDestroyPoolingDescriptor); \
__macro(miopenPoolingGetWorkSpaceSize); \
__macro(miopenPoolingGetWorkSpaceSizeV2); \
__macro(miopenSetNdPoolingDescriptor); \
__macro(miopenInitConvolutionDescriptor); \
__macro(miopenDestroyConvolutionDescriptor); \
__macro(miopenGetConvolutionNdDescriptor); \
__macro(miopenDeriveBNTensorDescriptor); \
__macro(miopenCreate); \
__macro(miopenDestroy); \
__macro(miopenSetStream); \
__macro(miopenActivationForward); \
__macro(miopenActivationBackward); \
__macro(miopenConvolutionBackwardWeights); \
__macro(miopenConvolutionForward); \
__macro(miopenConvolutionForwardBias); \
__macro(miopenConvolutionBackwardBias); \
__macro(miopenConvolutionForwardGetWorkSpaceSize); \
__macro(miopenConvolutionBackwardDataGetWorkSpaceSize); \
__macro(miopenTransformTensor); \
__macro(miopenPoolingForward); \
__macro(miopenPoolingBackward); \
__macro(miopenSoftmaxBackward); \
__macro(miopenSoftmaxBackward_V2); \
__macro(miopenSoftmaxForward); \
__macro(miopenSoftmaxForward_V2); \
__macro(miopenCreateDropoutDescriptor); \
__macro(miopenDestroyDropoutDescriptor); \
__macro(miopenRestoreDropoutDescriptor); \
__macro(miopenDropoutGetStatesSize); \
__macro(miopenSetDropoutDescriptor); \
__macro(miopenCreateRNNDescriptor); \
__macro(miopenDestroyRNNDescriptor); \
__macro(miopenSetRNNDescriptor); \
__macro(miopenSetRNNDescriptor_V2); \
__macro(miopenGetRNNParamsSize); \
__macro(miopenGetRNNWorkspaceSize); \
__macro(miopenGetRNNTrainingReserveSize); \
__macro(miopenRNNForwardTraining); \
__macro(miopenRNNBackwardData); \
__macro(miopenRNNBackwardWeights); \
__macro(miopenRNNForwardInference); \
__macro(miopenGetTensorNumBytes);
MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \
__macro(miopenConvolutionBackwardData);
MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R3:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
__macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R4:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
__macro(miopenBatchNormalizationForwardTraining); \
__macro(miopenBatchNormalizationForwardInference); \
__macro(miopenBatchNormalizationBackward);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R5
#define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \
__macro(miopenCreateActivationDescriptor); \
__macro(miopenSetActivationDescriptor); \
__macro(miopenGetActivationDescriptor); \
__macro(miopenDestroyActivationDescriptor);
MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R6
#define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \
/*__macro(miopenSetRNNDescriptor_v6);*/
MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(miopenSetConvolutionGroupCount); \
__macro(miopenCreateCTCLossDescriptor); \
__macro(miopenDestroyCTCLossDescriptor); \
__macro(miopenGetCTCLossDescriptor); \
__macro(miopenSetCTCLossDescriptor); \
__macro(miopenGetCTCLossWorkspaceSize); \
__macro(miopenCTCLoss);
MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
/*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
__macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/mklml.h"
namespace pten {
namespace dynload {
std::once_flag mklml_dso_flag;
void* mklml_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MKLML_ROUTINE_EACH(DEFINE_WRAP);
#if !defined(_WIN32)
DEFINE_WRAP(mkl_scsrmm);
DEFINE_WRAP(mkl_dcsrmm);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkl.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag mklml_dso_flag;
extern void *mklml_dso_handle;
/**
 * The following macro definition generates a struct
 * (for each function) that dynamically loads the mklml routine
 * via operator overloading.
 */
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklmlFunc = decltype(&::__name); \
std::call_once(mklml_dso_flag, []() { \
mklml_dso_handle = pten::dynload::GetMKLMLDsoHandle(); \
}); \
      static void *p_##__name = dlsym(mklml_dso_handle, #__name);        \
      return reinterpret_cast<mklmlFunc>(p_##__name)(args...);           \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
#define MKLML_ROUTINE_EACH(__macro) \
__macro(cblas_sgemm); \
__macro(cblas_dgemm); \
__macro(cblas_cgemm); \
__macro(cblas_zgemm); \
__macro(cblas_saxpy); \
__macro(cblas_daxpy); \
__macro(cblas_caxpy); \
__macro(cblas_zaxpy); \
__macro(cblas_scopy); \
__macro(cblas_dcopy); \
__macro(cblas_ccopy); \
__macro(cblas_zcopy); \
__macro(cblas_sgemv); \
__macro(cblas_dgemv); \
__macro(cblas_cgemv); \
__macro(cblas_zgemv); \
__macro(cblas_strsm); \
__macro(cblas_dtrsm); \
__macro(cblas_ctrsm); \
__macro(cblas_ztrsm); \
__macro(cblas_sgemm_alloc); \
__macro(cblas_dgemm_alloc); \
__macro(cblas_sgemm_pack); \
__macro(cblas_dgemm_pack); \
__macro(cblas_sgemm_compute); \
__macro(cblas_dgemm_compute); \
__macro(cblas_sgemm_free); \
__macro(cblas_dgemm_free); \
__macro(cblas_sgemm_batch); \
__macro(cblas_dgemm_batch); \
__macro(cblas_cgemm_batch); \
__macro(cblas_zgemm_batch); \
__macro(cblas_sdot); \
__macro(cblas_ddot); \
__macro(cblas_sasum); \
__macro(cblas_dasum); \
__macro(cblas_isamax); \
__macro(cblas_idamax); \
__macro(cblas_sscal); \
__macro(cblas_dscal); \
__macro(vsAdd); \
__macro(vdAdd); \
__macro(vsSub); \
__macro(vdSub); \
__macro(vsMul); \
__macro(vdMul); \
__macro(vsDiv); \
__macro(vdDiv); \
__macro(vsExp); \
__macro(vdExp); \
__macro(vsSqr); \
__macro(vdSqr); \
__macro(vsPowx); \
__macro(vdPowx); \
__macro(vsInv); \
__macro(vdInv); \
__macro(vmsErf); \
__macro(vmdErf); \
__macro(MKL_Free_Buffers); \
__macro(MKL_Set_Num_Threads); \
__macro(MKL_Get_Max_Threads);
MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
#if !defined(_WIN32)
DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
DYNAMIC_LOAD_MKLML_WRAP(mkl_dcsrmm);
#endif
#undef DYNAMIC_LOAD_MKLML_WRAP
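// Usage sketch (illustrative only): the wrappers are drop-in replacements
// for the native MKL symbols, e.g. a row-major 2x2 SGEMM computing C = A * B:
//
//   float a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, c[4] = {0};
//   pten::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
//                              2, 2, 2, 1.0f, a, 2, b, 2, 0.0f, c, 2);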
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/mklrt.h"
namespace pten {
namespace dynload {
std::once_flag mklrt_dso_flag;
void* mklrt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MKLDFTI_ROUTINE_EACH(DEFINE_WRAP);
DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc,
enum DFTI_CONFIG_VALUE prec,
enum DFTI_CONFIG_VALUE domain,
MKL_LONG dim,
MKL_LONG* sizes) {
if (prec == DFTI_SINGLE) {
if (dim == 1) {
return DftiCreateDescriptor_s_1d(desc, domain, sizes[0]);
} else {
return DftiCreateDescriptor_s_md(desc, domain, dim, sizes);
}
} else if (prec == DFTI_DOUBLE) {
if (dim == 1) {
return DftiCreateDescriptor_d_1d(desc, domain, sizes[0]);
} else {
return DftiCreateDescriptor_d_md(desc, domain, dim, sizes);
}
} else {
return DftiCreateDescriptor(desc, prec, domain, dim, sizes);
}
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkl_dfti.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag mklrt_dso_flag;
extern void* mklrt_dso_handle;
/**
 * The following macro definition generates a struct
 * (for each function) that dynamically loads the mkldfti routine
 * via operator overloading.
 */
#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklrtFunc = decltype(&::__name); \
std::call_once(mklrt_dso_flag, []() { \
mklrt_dso_handle = pten::dynload::GetMKLRTDsoHandle(); \
}); \
static void* p_##__name = dlsym(mklrt_dso_handle, #__name); \
return reinterpret_cast<mklrtFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
// mkl_dfti.h has a macro that shadows the function with the same name.
// Un-define this macro so that the function can be exported.
#undef DftiCreateDescriptor
#define MKLDFTI_ROUTINE_EACH(__macro) \
__macro(DftiCreateDescriptor); \
__macro(DftiCreateDescriptor_s_1d); \
__macro(DftiCreateDescriptor_d_1d); \
__macro(DftiCreateDescriptor_s_md); \
__macro(DftiCreateDescriptor_d_md); \
__macro(DftiSetValue); \
__macro(DftiGetValue); \
__macro(DftiCommitDescriptor); \
__macro(DftiComputeForward); \
__macro(DftiComputeBackward); \
__macro(DftiFreeDescriptor); \
__macro(DftiErrorClass); \
__macro(DftiErrorMessage);
MKLDFTI_ROUTINE_EACH(DYNAMIC_LOAD_MKLRT_WRAP)
#undef DYNAMIC_LOAD_MKLRT_WRAP
// define another function to avoid naming conflict
DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc,
enum DFTI_CONFIG_VALUE prec,
enum DFTI_CONFIG_VALUE domain,
MKL_LONG dim,
MKL_LONG* sizes);
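// Usage sketch (illustrative only): create a single-precision 1-D real FFT
// descriptor of length 1024 through the precision-dispatching helper:
//
//   DFTI_DESCRIPTOR_HANDLE desc = nullptr;
//   MKL_LONG sizes[1] = {1024};
//   MKL_LONG status = pten::dynload::DftiCreateDescriptorX(
//       &desc, DFTI_SINGLE, DFTI_REAL, 1, sizes);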
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/nccl.h"
namespace pten {
namespace dynload {
std::once_flag nccl_dso_flag;
void *nccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
#if NCCL_VERSION_CODE >= 2212
NCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2304
NCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif
#if NCCL_VERSION_CODE >= 21100
NCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP)
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <nccl.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(nccl_dso_flag, []() { \
nccl_dso_handle = pten::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define NCCL_RAND_ROUTINE_EACH(__macro) \
__macro(ncclCommInitAll); \
__macro(ncclGetUniqueId); \
__macro(ncclCommInitRank); \
__macro(ncclCommDestroy); \
__macro(ncclCommCount); \
__macro(ncclCommCuDevice); \
__macro(ncclCommUserRank); \
__macro(ncclAllReduce); \
__macro(ncclBcast); \
__macro(ncclAllGather); \
__macro(ncclGroupStart); \
__macro(ncclGroupEnd); \
__macro(ncclReduce); \
__macro(ncclReduceScatter); \
__macro(ncclGetErrorString);
NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
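// Usage sketch (illustrative only): the wrappers mirror the native NCCL API,
// e.g. obtaining a unique id when bootstrapping a communicator:
//
//   ncclUniqueId id;
//   pten::dynload::ncclGetUniqueId(&id);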
#if NCCL_VERSION_CODE >= 2212
#define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2304
#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion);
NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 21100
#define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \
__macro(ncclRedOpCreatePreMulSum); \
__macro(ncclRedOpDestroy);
NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/nvjpeg.h"
namespace pten {
namespace dynload {
std::once_flag nvjpeg_dso_flag;
void *nvjpeg_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <nvjpeg.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag nvjpeg_dso_flag;
extern void *nvjpeg_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
nvjpegStatus_t operator()(Args... args) { \
using nvjpegFunc = decltype(&::__name); \
std::call_once(nvjpeg_dso_flag, []() { \
nvjpeg_dso_handle = pten::dynload::GetNvjpegDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \
return reinterpret_cast<nvjpegFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define NVJPEG_RAND_ROUTINE_EACH(__macro) \
__macro(nvjpegCreateSimple); \
__macro(nvjpegJpegStateCreate); \
__macro(nvjpegGetImageInfo); \
__macro(nvjpegJpegStateDestroy); \
__macro(nvjpegDecode);
NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
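// Usage sketch (illustrative only): create a default nvjpeg handle through
// the dynamically loaded entry point:
//
//   nvjpegHandle_t handle;
//   pten::dynload::nvjpegCreateSimple(&handle);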
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/nvrtc.h"
namespace pten {
namespace dynload {
std::once_flag nvrtc_dso_flag;
void* nvrtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() {
std::call_once(nvrtc_dso_flag,
[]() { nvrtc_dso_handle = GetNVRTCDsoHandle(); });
return nvrtc_dso_handle != nullptr;
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <nvrtc.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag nvrtc_dso_flag;
extern void* nvrtc_dso_handle;
extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using nvrtc_func = decltype(&::__name); \
std::call_once(nvrtc_dso_flag, []() { \
nvrtc_dso_handle = pten::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(nvrtc_dso_handle, #__name); \
return reinterpret_cast<nvrtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/**
* include all needed nvrtc functions
**/
#define NVRTC_ROUTINE_EACH(__macro) \
__macro(nvrtcVersion); \
__macro(nvrtcGetErrorString); \
__macro(nvrtcCompileProgram); \
__macro(nvrtcCreateProgram); \
__macro(nvrtcDestroyProgram); \
__macro(nvrtcGetPTX); \
__macro(nvrtcGetPTXSize); \
__macro(nvrtcGetProgramLog); \
__macro(nvrtcGetProgramLogSize)
NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
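// Usage sketch (illustrative only): query the NVRTC version after checking
// that the library could be loaded:
//
//   int major = 0, minor = 0;
//   if (pten::dynload::HasNVRTC()) {
//     pten::dynload::nvrtcVersion(&major, &minor);
//   }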
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include "paddle/pten/backends/dynload/nvtx.h"
namespace pten {
namespace dynload {
std::once_flag nvtx_dso_flag;
void *nvtx_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVTX_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef _WIN32
#include <cuda.h>
#include <nvToolsExt.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag nvtx_dso_flag;
extern void *nvtx_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
int operator()(Args... args) { \
using nvtxFunc = decltype(&::__name); \
std::call_once(nvtx_dso_flag, []() { \
nvtx_dso_handle = pten::dynload::GetNvtxDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \
return reinterpret_cast<nvtxFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \
__macro(nvtxRangePop);
NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
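// Usage sketch (illustrative only): bracket a profiled region with an NVTX
// range so it shows up in profiling tools:
//
//   pten::dynload::nvtxRangePushA("forward_pass");
//   // ... work to be profiled ...
//   pten::dynload::nvtxRangePop();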
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/rccl.h"
namespace pten {
namespace dynload {
std::once_flag rccl_dso_flag;
void *rccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
#if NCCL_VERSION_CODE >= 2212
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <rccl.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag rccl_dso_flag;
extern void* rccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(rccl_dso_flag, []() { \
rccl_dso_handle = pten::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(rccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define RCCL_RAND_ROUTINE_EACH(__macro) \
__macro(ncclCommInitAll); \
__macro(ncclGetUniqueId); \
__macro(ncclCommInitRank); \
__macro(ncclCommDestroy); \
__macro(ncclCommCount); \
__macro(ncclCommCuDevice); \
__macro(ncclCommUserRank); \
__macro(ncclAllReduce); \
__macro(ncclBcast); \
__macro(ncclAllGather); \
__macro(ncclGroupStart); \
__macro(ncclGroupEnd); \
__macro(ncclReduce); \
__macro(ncclReduceScatter); \
__macro(ncclGetErrorString);
RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212
#define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/rocblas.h"
namespace pten {
namespace dynload {
std::once_flag rocblas_dso_flag;
void *rocblas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
ROCBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R2
ROCBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R3
ROCBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif
#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R4
ROCBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hip/hip_runtime.h>
#include <rocblas.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag rocblas_dso_flag;
extern void *rocblas_dso_handle;
/**
 * The following macro definition generates a struct
 * (for each function) that dynamically loads the rocblas routine
 * via operator overloading.
 *
 * note: default dynamic linked libs
 */
#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
rocblas_status operator()(Args... args) { \
using rocblas_func = decltype(&::__name); \
std::call_once(rocblas_dso_flag, []() { \
rocblas_dso_handle = pten::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \
return reinterpret_cast<rocblas_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(rocblas_caxpy); \
__macro(rocblas_saxpy); \
__macro(rocblas_daxpy); \
__macro(rocblas_zaxpy); \
__macro(rocblas_sscal); \
__macro(rocblas_dscal); \
__macro(rocblas_scopy); \
__macro(rocblas_dcopy); \
__macro(rocblas_cgemv); \
__macro(rocblas_sgemv); \
__macro(rocblas_zgemv); \
__macro(rocblas_dgemv); \
__macro(rocblas_cgemm); \
__macro(rocblas_sgemm); \
__macro(rocblas_dgemm); \
__macro(rocblas_hgemm); \
__macro(rocblas_zgemm); \
__macro(rocblas_sgeam); \
__macro(rocblas_strsm); \
__macro(rocblas_dtrsm); \
__macro(rocblas_dgeam); \
__macro(rocblas_sgemm_batched); \
__macro(rocblas_dgemm_batched); \
__macro(rocblas_cgemm_batched); \
__macro(rocblas_zgemm_batched); \
__macro(rocblas_create_handle); \
__macro(rocblas_destroy_handle); \
__macro(rocblas_set_stream); \
__macro(rocblas_get_stream); \
__macro(rocblas_set_pointer_mode); \
__macro(rocblas_get_pointer_mode);
ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// APIs mirroring the cuBLAS routines added after CUDA 8.0
#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
__macro(rocblas_gemm_ex); \
__macro(rocblas_sgemm_strided_batched); \
__macro(rocblas_dgemm_strided_batched); \
__macro(rocblas_cgemm_strided_batched); \
__macro(rocblas_zgemm_strided_batched); \
__macro(rocblas_hgemm_strided_batched);
ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// Not supported by HIP in ROCm 3.5:
// #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
// __macro(cublasSetMathMode);
// __macro(cublasGetMathMode);
// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(rocblas_gemm_batched_ex); \
__macro(rocblas_gemm_strided_batched_ex);
ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
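// Usage sketch (illustrative only): create and destroy a rocblas handle
// through the wrappers:
//
//   rocblas_handle handle;
//   pten::dynload::rocblas_create_handle(&handle);
//   pten::dynload::rocblas_destroy_handle(handle);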
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/rocm_driver.h"
namespace pten {
namespace dynload {
std::once_flag rocm_dso_flag;
void* rocm_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
ROCM_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() {
std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); });
return rocm_dso_handle != nullptr;
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hip/hip_runtime.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag rocm_dso_flag;
extern void* rocm_dso_handle;
extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using rocm_func = decltype(&::__name); \
std::call_once(rocm_dso_flag, []() { \
rocm_dso_handle = pten::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(rocm_dso_handle, #__name); \
return reinterpret_cast<rocm_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/**
 * include all needed ROCm (HIP) driver functions
 **/
#define ROCM_ROUTINE_EACH(__macro) \
__macro(hipDriverGetVersion); \
__macro(hipGetErrorString); \
__macro(hipModuleLoadData); \
__macro(hipModuleGetFunction); \
__macro(hipModuleUnload); \
  /* ROCm 3.5 does not support this function */                   \
/* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \
__macro(hipModuleLaunchKernel); \
__macro(hipLaunchKernel); \
__macro(hipGetDevice); \
__macro(hipGetDeviceCount); \
__macro(hipDevicePrimaryCtxGetState)
ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP
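// Usage sketch (illustrative only): guard driver calls behind the
// availability check, e.g. counting visible devices:
//
//   int count = 0;
//   if (pten::dynload::HasCUDADriver()) {
//     pten::dynload::hipGetDeviceCount(&count);
//   }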
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/tensorrt.h"
#include <string>
namespace pten {
namespace dynload {
std::once_flag tensorrt_dso_flag;
void* tensorrt_dso_handle;
std::once_flag tensorrt_plugin_dso_flag;
void* tensorrt_plugin_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
TENSORRT_RAND_ROUTINE_EACH_POINTER(DEFINE_WRAP);
TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(DEFINE_WRAP);
TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DEFINE_WRAP);
void* GetDsoHandle(const std::string& dso_name) {
#if !defined(_WIN32)
int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
#else
int dynload_flags = 0;
#endif // !_WIN32
void* dso_handle = dlopen(dso_name.c_str(), dynload_flags);
if (nullptr == dso_handle) {
auto error_msg =
"You are using Paddle compiled with TensorRT, but TensorRT dynamic "
"library is not found. Ignore this if TensorRT is not needed.\n"
"The TensorRT that Paddle depends on is not configured correctly.\n"
" Suggestions:\n"
" 1. Check if the TensorRT is installed correctly and its version"
" is matched with paddlepaddle you installed.\n"
" 2. Configure environment variables as "
"follows:\n"
" - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
" - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
" - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...`\n";
LOG(WARNING) << error_msg;
}
return dso_handle;
}
void* GetTensorRtHandle() {
#if defined(__APPLE__) || defined(__OSX__)
std::string dso_name = "libnvinfer.dylib";
#elif defined(_WIN32)
std::string dso_name = "nvinfer.dll";
#else
std::string dso_name = "libnvinfer.so";
#endif
return GetDsoHandle(dso_name);
}
void* GetTensorRtPluginHandle() {
#if defined(__APPLE__) || defined(__OSX__)
std::string dso_name = "libnvinfer_plugin.dylib";
#elif defined(_WIN32)
std::string dso_name = "nvinfer_plugin.dll";
#else
std::string dso_name = "libnvinfer_plugin.so";
#endif
return GetDsoHandle(dso_name);
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <NvInfer.h>
#include <NvInferPlugin.h>
#if !defined(_WIN32)
#include <dlfcn.h>
#endif
#include <mutex> // NOLINT
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/dynamic_loader.h"
namespace pten {
namespace dynload {
void* GetTensorRtHandle();
extern std::once_flag tensorrt_dso_flag;
extern void* tensorrt_dso_handle;
void* GetTensorRtPluginHandle();
extern std::once_flag tensorrt_plugin_dso_flag;
extern void* tensorrt_plugin_dso_handle;
#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
void* operator()(Args... args) { \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = pten::dynload::GetTensorRtHandle(); \
}); \
static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
if (p_##__name == nullptr) { \
return nullptr; \
} \
using tensorrt_func = decltype(&::__name); \
auto ret = reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
return static_cast<void*>(ret); \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = pten::dynload::GetTensorRtHandle(); \
}); \
static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
PADDLE_ENFORCE_NOT_NULL(p_##__name, \
paddle::platform::errors::Unavailable( \
"Load tensorrt api %s failed", #__name)); \
using tensorrt_func = decltype(&::__name); \
return reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
std::call_once(tensorrt_plugin_dso_flag, []() { \
tensorrt_plugin_dso_handle = pten::dynload::GetTensorRtPluginHandle(); \
}); \
static void* p_##__name = dlsym(tensorrt_plugin_dso_handle, #__name); \
PADDLE_ENFORCE_NOT_NULL(p_##__name, \
paddle::platform::errors::Unavailable( \
"Load tensorrt plugin %s failed", #__name)); \
using tensorrt_plugin_func = decltype(&::__name); \
return reinterpret_cast<tensorrt_plugin_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#ifdef NV_TENSORRT_MAJOR
#if (NV_TENSORRT_MAJOR >= 6)
#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL); \
__macro(getPluginRegistry);
#else
#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL);
#endif
#define TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(__macro) \
__macro(getInferLibVersion);
#define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \
__macro(initLibNvInferPlugins);
TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP)
TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(
DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP)
TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP)
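// Usage sketch (illustrative only): query the version of the TensorRT
// library found at runtime:
//
//   int version = pten::dynload::getInferLibVersion();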
#endif // end of NV_TENSORRT_MAJOR
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/warpctc.h"
namespace pten {
namespace dynload {
std::once_flag warpctc_dso_flag;
void* warpctc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
WARPCTC_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
#include "warpctc/include/ctc.h"
namespace pten {
namespace dynload {
extern std::once_flag warpctc_dso_flag;
extern void* warpctc_dso_handle;
/**
 * The following macro definition generates a struct
 * (for each function) that dynamically loads the warpctc routine
 * via operator overloading.
 */
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using warpctcFunc = decltype(&::__name); \
std::call_once(warpctc_dso_flag, []() { \
warpctc_dso_handle = pten::dynload::GetWarpCTCDsoHandle(); \
}); \
      static void* p_##__name = dlsym(warpctc_dso_handle, #__name);      \
      return reinterpret_cast<warpctcFunc>(p_##__name)(args...);         \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
DYNAMIC_LOAD_WARPCTC_WRAP(__name)
#define WARPCTC_ROUTINE_EACH(__macro) \
__macro(get_warpctc_version); \
__macro(ctcGetStatusString); \
__macro(compute_ctc_loss); \
__macro(compute_ctc_loss_double); \
__macro(get_workspace_size); \
__macro(get_workspace_size_double)
WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP);
#undef DYNAMIC_LOAD_WARPCTC_WRAP
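// Usage sketch (illustrative only): query the warp-ctc version through the
// dynamically loaded entry point:
//
//   int version = pten::dynload::get_warpctc_version();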
} // namespace dynload
} // namespace pten