未验证 提交 3c1dc6f6 编写于 作者: W Wilber 提交者: GitHub

[PTEN] Move dynload from fluid to pten. (#39120)

* move dynload from fluid to pten.

* fix ci compile

* fix windows ci compile.

* update

* update

* fix compile error
上级 11938ae1
...@@ -27,7 +27,7 @@ limitations under the License. */ ...@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
namespace butil { namespace butil {
class IOBuf; class IOBuf;
...@@ -78,11 +78,11 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg, ...@@ -78,11 +78,11 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
const framework::Scope* scope); const framework::Scope* scope);
void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg, void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& iobuf, butil::IOBufBytesIterator& iobuf, // NOLINT
const platform::DeviceContext& ctx); const platform::DeviceContext& ctx);
void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg, void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& iobuf, butil::IOBufBytesIterator& iobuf, // NOLINT
const platform::DeviceContext& ctx); const platform::DeviceContext& ctx);
std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port); std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port);
......
...@@ -40,9 +40,9 @@ ...@@ -40,9 +40,9 @@
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle { namespace paddle {
namespace distributed { namespace distributed {
...@@ -202,7 +202,7 @@ class ValueBlock { ...@@ -202,7 +202,7 @@ class ValueBlock {
// value = _alloc.acquire(value_length_); // value = _alloc.acquire(value_length_);
table[id] = value; table[id] = value;
} else { } else {
value = (VALUE *)(void *)(res->second); value = (VALUE *)(void *)(res->second); // NOLINT
} }
return value; return value;
} }
...@@ -282,8 +282,8 @@ class ValueBlock { ...@@ -282,8 +282,8 @@ class ValueBlock {
value->unseen_days_++; value->unseen_days_++;
if (value->unseen_days_ >= threshold) { if (value->unseen_days_ >= threshold) {
butil::return_object(iter->second); butil::return_object(iter->second);
//_alloc.release(iter->second); // _alloc.release(iter->second);
//_alloc.release(value); // _alloc.release(value);
iter = table.erase(iter); iter = table.erase(iter);
} else { } else {
++iter; ++iter;
......
...@@ -38,8 +38,8 @@ limitations under the License. */ ...@@ -38,8 +38,8 @@ limitations under the License. */
#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h" #include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/platform/timer.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "paddle/fluid/framework/io/shell.h" #include "paddle/fluid/framework/io/shell.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
......
...@@ -34,8 +34,8 @@ ...@@ -34,8 +34,8 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/backends/dynload/port.h"
#if defined(__arm__) || defined(__aarch64__) || defined(__ARM_NEON) || \ #if defined(__arm__) || defined(__aarch64__) || defined(__ARM_NEON) || \
defined(__ARM_NEON__) defined(__ARM_NEON__)
......
...@@ -34,7 +34,7 @@ limitations under the License. */ ...@@ -34,7 +34,7 @@ limitations under the License. */
#include "paddle/fluid/framework/trainer_desc.pb.h" #include "paddle/fluid/framework/trainer_desc.pb.h"
#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h" #include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
......
...@@ -28,7 +28,7 @@ limitations under the License. */ ...@@ -28,7 +28,7 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
#ifdef _WIN32 #ifdef _WIN32
#include <direct.h> #include <direct.h>
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -31,8 +31,8 @@ ...@@ -31,8 +31,8 @@
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/pten/backends/dynload/port.h"
extern std::string paddle::framework::DataTypeToString( extern std::string paddle::framework::DataTypeToString(
const framework::proto::VarType::Type type); const framework::proto::VarType::Type type);
......
...@@ -22,8 +22,8 @@ limitations under the License. */ ...@@ -22,8 +22,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/pten/backends/dynload/port.h"
DECLARE_bool(use_mkldnn); DECLARE_bool(use_mkldnn);
......
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
DECLARE_bool(use_mkldnn); DECLARE_bool(use_mkldnn);
......
...@@ -27,7 +27,7 @@ limitations under the License. */ ...@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
#define HCOM_GROUP_PREFIX "HCOM_GROUP_" #define HCOM_GROUP_PREFIX "HCOM_GROUP_"
......
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce pten_dynamic_loader)
list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
...@@ -34,24 +34,24 @@ if (CUPTI_FOUND) ...@@ -34,24 +34,24 @@ if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc) list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND) endif(CUPTI_FOUND)
if(WITH_ROCM) if(WITH_ROCM)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader) hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader pten_dynload_cuda)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
elseif (WITH_ASCEND_CL) elseif (WITH_ASCEND_CL)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl pten_dynload_warpctc)
else() else()
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader pten_dynload_cuda)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
endif() endif()
if (WITH_MKLML) if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml pten_dynload_mklml)
endif() endif()
cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader) cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader pten_dynload_lapack)
add_dependencies(dynload_lapack extern_lapack) add_dependencies(dynload_lapack extern_lapack)
# TODO(TJ): add iomp, mkldnn? # TODO(TJ): add iomp, mkldnn?
if (MKL_FOUND AND WITH_ONEMKL) if (MKL_FOUND AND WITH_ONEMKL)
message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader) cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader pten_dynload_mklrt)
target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE}) target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE})
endif() endif()
...@@ -17,8 +17,6 @@ limitations under the License. */ ...@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cublas_dso_flag;
void *cublas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
......
...@@ -20,16 +20,12 @@ limitations under the License. */ ...@@ -20,16 +20,12 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <type_traits> #include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cublas_dso_flag;
extern void *cublas_dso_handle;
/** /**
* The following macro definition can generate structs * The following macro definition can generate structs
* (for each function) to dynamic load cublas routine * (for each function) to dynamic load cublas routine
...@@ -37,19 +33,8 @@ extern void *cublas_dso_handle; ...@@ -37,19 +33,8 @@ extern void *cublas_dso_handle;
* *
* note: default dynamic linked libs * note: default dynamic linked libs
*/ */
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublas_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublas_dso_flag, []() { \
cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublas_dso_handle, #__name); \
return reinterpret_cast<cublas_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
...@@ -99,7 +84,7 @@ extern void *cublas_dso_handle; ...@@ -99,7 +84,7 @@ extern void *cublas_dso_handle;
__macro(cublasSgetrsBatched); \ __macro(cublasSgetrsBatched); \
__macro(cublasDgetrsBatched); __macro(cublasDgetrsBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) CUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0 // APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000 #if CUDA_VERSION >= 8000
...@@ -111,7 +96,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) ...@@ -111,7 +96,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasZgemmStridedBatched); \ __macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched); __macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) CUBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif #endif
// APIs available after CUDA 9.0 // APIs available after CUDA 9.0
...@@ -120,7 +105,7 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) ...@@ -120,7 +105,7 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasSetMathMode); \ __macro(cublasSetMathMode); \
__macro(cublasGetMathMode); __macro(cublasGetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) CUBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif #endif
// APIs available after CUDA 9.1 // APIs available after CUDA 9.1
...@@ -129,10 +114,10 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) ...@@ -129,10 +114,10 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasGemmBatchedEx); \ __macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx); __macro(cublasGemmStridedBatchedEx);
CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) CUBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -17,8 +17,6 @@ limitations under the License. */ ...@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cublasLt_dso_flag;
void *cublasLt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
......
...@@ -19,16 +19,12 @@ limitations under the License. */ ...@@ -19,16 +19,12 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <type_traits> #include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cublasLt.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cublasLt_dso_flag;
extern void *cublasLt_dso_handle;
/** /**
* The following macro definition can generate structs * The following macro definition can generate structs
* (for each function) to dynamic load cublasLt routine * (for each function) to dynamic load cublasLt routine
...@@ -36,20 +32,8 @@ extern void *cublasLt_dso_handle; ...@@ -36,20 +32,8 @@ extern void *cublasLt_dso_handle;
* *
* note: default dynamic linked libs * note: default dynamic linked libs
*/ */
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = \
paddle::platform::dynload::GetCublasLtDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
// APIs available after CUDA 10.1 // APIs available after CUDA 10.1
...@@ -69,10 +53,10 @@ extern void *cublasLt_dso_handle; ...@@ -69,10 +53,10 @@ extern void *cublasLt_dso_handle;
__macro(cublasLtMatrixTransformDescDestroy); \ __macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute); __macro(cublasLtMatrixTransformDescSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif // #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -13,14 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,14 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/cuda_driver.h" #include "paddle/fluid/platform/dynload/cuda_driver.h"
#include "paddle/pten/backends/dynload/cuda_driver.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cuda_dso_flag;
void* cuda_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
#if CUDA_VERSION >= 10020 #if CUDA_VERSION >= 10020
...@@ -28,10 +26,7 @@ CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP); ...@@ -28,10 +26,7 @@ CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP);
#endif #endif
CUDA_ROUTINE_EACH(DEFINE_WRAP); CUDA_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() { bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); });
return cuda_dso_handle != nullptr;
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -17,30 +17,17 @@ limitations under the License. */ ...@@ -17,30 +17,17 @@ limitations under the License. */
#include <cuda.h> #include <cuda.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cuda_driver.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cuda_dso_flag;
extern void* cuda_dso_handle;
extern bool HasCUDADriver(); extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \ extern DynLoad__##__name __name
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cuda_func = decltype(&::__name); \
std::call_once(cuda_dso_flag, []() { \
cuda_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(cuda_dso_handle, #__name); \
return reinterpret_cast<cuda_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/** /**
* include all needed cuda driver functions * include all needed cuda driver functions
...@@ -72,12 +59,12 @@ extern bool HasCUDADriver(); ...@@ -72,12 +59,12 @@ extern bool HasCUDADriver();
__macro(cuMemRelease); \ __macro(cuMemRelease); \
__macro(cuMemAddressFree) __macro(cuMemAddressFree)
CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); CUDA_ROUTINE_EACH_VVM(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#endif #endif
CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); CUDA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cudnn_dso_flag;
void* cudnn_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
...@@ -45,19 +43,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); ...@@ -45,19 +43,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
#endif #endif
bool HasCUDNN() { bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
return cudnn_dso_handle != nullptr;
}
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cudnn_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cudnn shared library. Cannot invoke method %s.",
fn_name));
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -18,32 +18,17 @@ limitations under the License. */ ...@@ -18,32 +18,17 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cudnn_dso_flag;
extern void* cudnn_dso_handle;
extern bool HasCUDNN(); extern bool HasCUDNN();
extern void EnforceCUDNNLoaded(const char* fn_name); #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
struct DynLoad__##__name { \ extern DynLoad__##__name __name
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cudnn_func = decltype(&::__name); \
std::call_once(cudnn_dso_flag, []() { \
cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/** /**
* include all needed cudnn functions in HPPL * include all needed cudnn functions in HPPL
...@@ -127,7 +112,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -127,7 +112,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnGetActivationDescriptor); \ __macro(cudnnGetActivationDescriptor); \
__macro(cudnnDestroyActivationDescriptor); \ __macro(cudnnDestroyActivationDescriptor); \
__macro(cudnnSetRNNDescriptor_v6); __macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \
...@@ -135,7 +120,8 @@ CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -135,7 +120,8 @@ CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetConvolutionForwardAlgorithm); \ __macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \ __macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnSetRNNDescriptor); __macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(
PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
...@@ -153,7 +139,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -153,7 +139,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
__macro(cudnnGetConvolutionForwardAlgorithm_v7); \ __macro(cudnnGetConvolutionForwardAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount); __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 7201 #if CUDNN_VERSION >= 7201
...@@ -166,7 +152,7 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -166,7 +152,7 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnRNNBackwardDataEx); \ __macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \ __macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx); __macro(cudnnRNNForwardInferenceEx);
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 7401 #if CUDNN_VERSION >= 7401
...@@ -176,7 +162,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -176,7 +162,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \ __macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 8000 #if CUDNN_VERSION >= 8000
...@@ -192,7 +178,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -192,7 +178,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnSetFusedOpsConstParamPackAttribute); \ __macro(cudnnSetFusedOpsConstParamPackAttribute); \
__macro(cudnnSetFusedOpsVariantParamPackAttribute); \ __macro(cudnnSetFusedOpsVariantParamPackAttribute); \
__macro(cudnnMakeFusedOpsPlan); __macro(cudnnMakeFusedOpsPlan);
CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R8(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
} // namespace dynload } // namespace dynload
......
...@@ -13,31 +13,17 @@ See the License for the specific language governing permissions and ...@@ -13,31 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/cufft.h" #include "paddle/fluid/platform/dynload/cufft.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/pten/backends/dynload/cufft.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cufft_dso_flag;
void* cufft_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUFFT() { bool HasCUFFT() { return pten::dynload::HasCUFFT(); }
std::call_once(cufft_dso_flag,
[]() { cufft_dso_handle = GetCUFFTDsoHandle(); });
return cufft_dso_handle != nullptr;
}
void EnforceCUFFTLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cufft_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cufft shared library. Cannot invoke method %s.",
fn_name));
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -19,32 +19,17 @@ limitations under the License. */ ...@@ -19,32 +19,17 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cufft.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cufft_dso_flag;
extern void* cufft_dso_handle;
extern bool HasCUFFT(); extern bool HasCUFFT();
extern void EnforceCUFFTLoaded(const char* fn_name); #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \
#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
struct DynLoad__##__name { \ extern DynLoad__##__name __name
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cufft_func = decltype(&::__name); \
std::call_once(cufft_dso_flag, []() { \
cufft_dso_handle = paddle::platform::dynload::GetCUFFTDsoHandle(); \
}); \
EnforceCUFFTLoaded(#__name); \
static void* p_##__name = dlsym(cufft_dso_handle, #__name); \
return reinterpret_cast<cufft_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/** /**
* include all needed cufft functions in HPPL * include all needed cufft functions in HPPL
...@@ -104,7 +89,7 @@ extern void EnforceCUFFTLoaded(const char* fn_name); ...@@ -104,7 +89,7 @@ extern void EnforceCUFFTLoaded(const char* fn_name);
__macro(cufftXtExecDescriptor); \ __macro(cufftXtExecDescriptor); \
__macro(cufftXtSetWorkAreaPolicy); __macro(cufftXtSetWorkAreaPolicy);
CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP) CUFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -20,9 +20,6 @@ namespace paddle { ...@@ -20,9 +20,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cupti_dso_flag;
void *cupti_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
CUPTI_ROUTINE_EACH(DEFINE_WRAP); CUPTI_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -19,16 +19,12 @@ limitations under the License. */ ...@@ -19,16 +19,12 @@ limitations under the License. */
#include <cupti.h> #include <cupti.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cupti.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cupti_dso_flag;
extern void *cupti_dso_handle;
/** /**
* The following macro definition can generate structs * The following macro definition can generate structs
* (for each function) to dynamic load cupti routine * (for each function) to dynamic load cupti routine
...@@ -36,18 +32,8 @@ extern void *cupti_dso_handle; ...@@ -36,18 +32,8 @@ extern void *cupti_dso_handle;
* *
* note: default dynamic linked libs * note: default dynamic linked libs
*/ */
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
using cuptiFunc = decltype(&::__name); \
std::call_once(cupti_dso_flag, []() { \
cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
}); \
static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define CUPTI_ROUTINE_EACH(__macro) \ #define CUPTI_ROUTINE_EACH(__macro) \
......
...@@ -18,9 +18,6 @@ namespace paddle { ...@@ -18,9 +18,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag curand_dso_flag;
void *curand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -16,27 +16,14 @@ limitations under the License. */ ...@@ -16,27 +16,14 @@ limitations under the License. */
#include <curand.h> #include <curand.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/curand.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag curand_dso_flag;
extern void *curand_dso_handle; #define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
curandStatus_t operator()(Args... args) { \
using curandFunc = decltype(&::__name); \
std::call_once(curand_dso_flag, []() { \
curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(curand_dso_handle, #__name); \
return reinterpret_cast<curandFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define CURAND_RAND_ROUTINE_EACH(__macro) \ #define CURAND_RAND_ROUTINE_EACH(__macro) \
...@@ -48,7 +35,7 @@ extern void *curand_dso_handle; ...@@ -48,7 +35,7 @@ extern void *curand_dso_handle;
__macro(curandGenerateNormal); \ __macro(curandGenerateNormal); \
__macro(curandDestroyGenerator); __macro(curandDestroyGenerator);
CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); CURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -18,9 +18,6 @@ namespace paddle { ...@@ -18,9 +18,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cusolver_dso_flag;
void *cusolver_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
CUSOLVER_ROUTINE_EACH(DEFINE_WRAP); CUSOLVER_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -17,28 +17,14 @@ limitations under the License. */ ...@@ -17,28 +17,14 @@ limitations under the License. */
#include <cusolverDn.h> #include <cusolverDn.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cusolver.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cusolver_dso_flag;
extern void *cusolver_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
cusolverStatus_t operator()(Args... args) { \
using cusolverFunc = decltype(&::__name); \
std::call_once(cusolver_dso_flag, []() { \
cusolver_dso_handle = \
paddle::platform::dynload::GetCusolverDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \
return reinterpret_cast<cusolverFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define CUSOLVER_ROUTINE_EACH(__macro) \ #define CUSOLVER_ROUTINE_EACH(__macro) \
...@@ -62,7 +48,7 @@ extern void *cusolver_dso_handle; ...@@ -62,7 +48,7 @@ extern void *cusolver_dso_handle;
__macro(cusolverDnCheevd); \ __macro(cusolverDnCheevd); \
__macro(cusolverDnZheevd); __macro(cusolverDnZheevd);
CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); CUSOLVER_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
#if CUDA_VERSION >= 9020 #if CUDA_VERSION >= 9020
#define CUSOLVER_ROUTINE_EACH_R1(__macro) \ #define CUSOLVER_ROUTINE_EACH_R1(__macro) \
...@@ -105,7 +91,7 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); ...@@ -105,7 +91,7 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
__macro(cusolverDnCungqr); \ __macro(cusolverDnCungqr); \
__macro(cusolverDnZungqr); __macro(cusolverDnZungqr);
CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) CUSOLVER_ROUTINE_EACH_R1(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif #endif
#if CUDA_VERSION >= 9020 #if CUDA_VERSION >= 9020
...@@ -117,10 +103,10 @@ CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) ...@@ -117,10 +103,10 @@ CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
__macro(cusolverDnDsyevj); \ __macro(cusolverDnDsyevj); \
__macro(cusolverDnDestroySyevjInfo); __macro(cusolverDnDestroySyevjInfo);
CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) CUSOLVER_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif #endif
#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -18,9 +18,6 @@ namespace paddle { ...@@ -18,9 +18,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag cusparse_dso_flag;
void *cusparse_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
#ifdef CUSPARSE_ROUTINE_EACH #ifdef CUSPARSE_ROUTINE_EACH
......
...@@ -17,28 +17,14 @@ limitations under the License. */ ...@@ -17,28 +17,14 @@ limitations under the License. */
#include <cusparse.h> #include <cusparse.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/cusparse.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag cusparse_dso_flag;
extern void *cusparse_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
cusparseStatus_t operator()(Args... args) { \
using cusparseFunc = decltype(&::__name); \
std::call_once(cusparse_dso_flag, []() { \
cusparse_dso_handle = \
paddle::platform::dynload::GetCusparseDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
return reinterpret_cast<cusparseFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
...@@ -54,7 +40,7 @@ extern void *cusparse_dso_handle; ...@@ -54,7 +40,7 @@ extern void *cusparse_dso_handle;
__macro(cusparseSetMatType); \ __macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase); __macro(cusparseSetMatIndexBase);
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP); CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
// APIs available after CUDA 11.2 // APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020 #if CUDA_VERSION >= 11020
...@@ -74,7 +60,7 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP); ...@@ -74,7 +60,7 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseSparseToDense_bufferSize); \ __macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense); __macro(cusparseSparseToDense);
CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
// APIs available after CUDA 11.3 // APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030 #if CUDA_VERSION >= 11030
...@@ -83,13 +69,13 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) ...@@ -83,13 +69,13 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
__macro(cusparseSDDMM_preprocess); \ __macro(cusparseSDDMM_preprocess); \
__macro(cusparseSDDMM); __macro(cusparseSDDMM);
CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif #endif
#endif #endif
#endif #endif
#endif #endif
#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -18,9 +18,6 @@ namespace paddle { ...@@ -18,9 +18,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag hipfft_dso_flag;
void *hipfft_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -17,8 +17,7 @@ limitations under the License. */ ...@@ -17,8 +17,7 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/hipfft.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -26,18 +25,8 @@ namespace dynload { ...@@ -26,18 +25,8 @@ namespace dynload {
extern std::once_flag hipfft_dso_flag; extern std::once_flag hipfft_dso_flag;
extern void *hipfft_dso_handle; extern void *hipfft_dso_handle;
#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hipfftFunc = decltype(&::__name); \
std::call_once(hipfft_dso_flag, []() { \
hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \
}); \
static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \
return reinterpret_cast<hipfftFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define HIPFFT_FFT_ROUTINE_EACH(__macro) \ #define HIPFFT_FFT_ROUTINE_EACH(__macro) \
...@@ -70,53 +59,8 @@ extern void *hipfft_dso_handle; ...@@ -70,53 +59,8 @@ extern void *hipfft_dso_handle;
__macro(hipfftGetVersion); \ __macro(hipfftGetVersion); \
__macro(hipfftGetProperty); __macro(hipfftGetProperty);
HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP); HIPFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
inline const char *hipfftGetErrorString(hipfftResult_t status) {
switch (status) {
case HIPFFT_SUCCESS:
return "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
case HIPFFT_INVALID_PLAN:
return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
case HIPFFT_ALLOC_FAILED:
return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
"memory.";
case HIPFFT_INVALID_TYPE:
return "'HIPFFT_INVALID_TYPE'. No longer used.";
case HIPFFT_INVALID_VALUE:
return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
"parameter.";
case HIPFFT_INTERNAL_ERROR:
return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
"error.";
case HIPFFT_EXEC_FAILED:
return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
case HIPFFT_SETUP_FAILED:
return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
case HIPFFT_INVALID_SIZE:
return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
case HIPFFT_UNALIGNED_DATA:
return "'HIPFFT_UNALIGNED_DATA'. No longer used.";
case HIPFFT_INCOMPLETE_PARAMETER_LIST:
return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
case HIPFFT_INVALID_DEVICE:
return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
"GPU than plan creation.";
case HIPFFT_PARSE_ERROR:
return "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
case HIPFFT_NO_WORKSPACE:
return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
"plan execution.";
case HIPFFT_NOT_IMPLEMENTED:
return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
"functionality for parameters given.";
case HIPFFT_NOT_SUPPORTED:
return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
"parameters given.";
default:
return "HIPFFT_STATUS_UNKNOWN_ERROR";
}
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
......
...@@ -18,9 +18,6 @@ namespace paddle { ...@@ -18,9 +18,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag hiprand_dso_flag;
void *hiprand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP); HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -16,28 +16,15 @@ limitations under the License. */ ...@@ -16,28 +16,15 @@ limitations under the License. */
#include <hiprand.h> #include <hiprand.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/hiprand.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag hiprand_dso_flag;
extern void *hiprand_dso_handle; #define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
hiprandStatus_t operator()(Args... args) { \
using hiprandFunc = decltype(&::__name); \
std::call_once(hiprand_dso_flag, []() { \
hiprand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \
return reinterpret_cast<hiprandFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define HIPRAND_RAND_ROUTINE_EACH(__macro) \ #define HIPRAND_RAND_ROUTINE_EACH(__macro) \
...@@ -49,7 +36,7 @@ extern void *hiprand_dso_handle; ...@@ -49,7 +36,7 @@ extern void *hiprand_dso_handle;
__macro(hiprandGenerateNormal); \ __macro(hiprandGenerateNormal); \
__macro(hiprandDestroyGenerator); __macro(hiprandDestroyGenerator);
HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); HIPRAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and ...@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/hiprtc.h" #include "paddle/fluid/platform/dynload/hiprtc.h"
#include "paddle/pten/backends/dynload/hiprtc.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag hiprtc_dso_flag;
void* hiprtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRTC_ROUTINE_EACH(DEFINE_WRAP); HIPRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() { bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
std::call_once(hiprtc_dso_flag,
[]() { hiprtc_dso_handle = GetNVRTCDsoHandle(); });
return hiprtc_dso_handle != nullptr;
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -16,30 +16,17 @@ limitations under the License. */ ...@@ -16,30 +16,17 @@ limitations under the License. */
#include <hip/hiprtc.h> #include <hip/hiprtc.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/hiprtc.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag hiprtc_dso_flag;
extern void* hiprtc_dso_handle;
extern bool HasNVRTC(); extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \ extern DynLoad__##__name __name
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hiprtc_func = decltype(&::__name); \
std::call_once(hiprtc_dso_flag, []() { \
hiprtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \
return reinterpret_cast<hiprtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/** /**
* include all needed hiprtc functions * include all needed hiprtc functions
...@@ -55,9 +42,9 @@ extern bool HasNVRTC(); ...@@ -55,9 +42,9 @@ extern bool HasNVRTC();
__macro(hiprtcGetProgramLog); \ __macro(hiprtcGetProgramLog); \
__macro(hiprtcGetProgramLogSize) __macro(hiprtcGetProgramLogSize)
HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP); HIPRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/lapack.h" #include "paddle/fluid/platform/dynload/lapack.h"
#include <mutex>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag lapack_dso_flag;
void* lapack_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
LAPACK_ROUTINE_EACH(DEFINE_WRAP); LAPACK_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -16,122 +16,20 @@ limitations under the License. */ ...@@ -16,122 +16,20 @@ limitations under the License. */
#include <complex> #include <complex>
#include <mutex> #include <mutex>
#include "paddle/fluid/platform/complex.h" #include "paddle/pten/backends/dynload/lapack.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/common/complex.h"
#include "paddle/fluid/platform/port.h"
// Note(zhouwei): because lapack doesn't provide appropriate header file.
// should expose API statement yourself.
// getrf_(For example)
extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv,
int *info);
extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv,
int *info);
// evd
extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex<double> *a,
int *lda, double *w, std::complex<double> *work,
int *lwork, double *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex<float> *a,
int *lda, float *w, std::complex<float> *work,
int *lwork, float *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda,
double *w, double *work, int *lwork, int *iwork,
int *liwork, int *info);
extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda,
float *w, float *work, int *lwork, int *iwork,
int *liwork, int *info);
// geev
extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda,
double *wr, double *wi, double *vl, int *ldvl,
double *vr, int *ldvr, double *work, int *lwork,
int *info);
extern "C" void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda,
float *wr, float *wi, float *vl, int *ldvl, float *vr,
int *ldvr, float *work, int *lwork, int *info);
extern "C" void zgeev_(char *jobvl, char *jobvr, int *n,
std::complex<double> *a, int *lda,
std::complex<double> *w, std::complex<double> *vl,
int *ldvl, std::complex<double> *vr, int *ldvr,
std::complex<double> *work, int *lwork, double *rwork,
int *info);
extern "C" void cgeev_(char *jobvl, char *jobvr, int *n, std::complex<float> *a,
int *lda, std::complex<float> *w,
std::complex<float> *vl, int *ldvl,
std::complex<float> *vr, int *ldvr,
std::complex<float> *work, int *lwork, float *rwork,
int *info);
// gels
extern "C" void dgels_(char *trans, int *m, int *n, int *nrhs, double *a,
int *lda, double *b, int *ldb, double *work, int *lwork,
int *info);
extern "C" void sgels_(char *trans, int *m, int *n, int *nrhs, float *a,
int *lda, float *b, int *ldb, float *work, int *lwork,
int *info);
// gelsd
extern "C" void dgelsd_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, double *s, double *rcond,
int *rank, double *work, int *lwork, int *iwork,
int *info);
extern "C" void sgelsd_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, float *s, float *rcond, int *rank,
float *work, int *lwork, int *iwork, int *info);
// gelsy
extern "C" void dgelsy_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, int *jpvt, double *rcond,
int *rank, double *work, int *lwork, int *info);
extern "C" void sgelsy_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, int *jpvt, float *rcond, int *rank,
float *work, int *lwork, int *info);
// gelss
extern "C" void dgelss_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, double *s, double *rcond,
int *rank, double *work, int *lwork, int *info);
extern "C" void sgelss_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, float *s, float *rcond, int *rank,
float *work, int *lwork, int *info);
extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex<double> *a,
int *lda, std::complex<double> *b, int *ldb, int *info);
extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex<float> *a,
int *lda, std::complex<float> *b, int *ldb, int *info);
extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, int *info);
extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda,
float *b, int *ldb, int *info);
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag lapack_dso_flag;
extern void *lapack_dso_handle;
/** /**
* The following macro definition can generate structs * The following macro definition can generate structs
* (for each function) to dynamic load lapack routine * (for each function) to dynamic load lapack routine
* via operator overloading. * via operator overloading.
*/ */
#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ #define DYNAMIC_LOAD_LAPACK_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using lapackFunc = decltype(&::__name); \
std::call_once(lapack_dso_flag, []() { \
lapack_dso_handle = paddle::platform::dynload::GetLAPACKDsoHandle(); \
}); \
static void *p_##_name = dlsym(lapack_dso_handle, #__name); \
return reinterpret_cast<lapackFunc>(p_##_name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \
......
...@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag miopen_dso_flag;
void* miopen_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
...@@ -50,19 +48,7 @@ MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); ...@@ -50,19 +48,7 @@ MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif #endif
bool HasCUDNN() { bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
std::call_once(miopen_dso_flag,
[]() { miopen_dso_handle = GetCUDNNDsoHandle(); });
return miopen_dso_handle != nullptr;
}
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
miopen_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load miopen shared library. Cannot invoke method %s.",
fn_name));
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -18,66 +18,17 @@ limitations under the License. */ ...@@ -18,66 +18,17 @@ limitations under the License. */
#include <miopen/miopen.h> #include <miopen/miopen.h>
#include <miopen/version.h> #include <miopen/version.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/fluid/platform/port.h"
#define MIOPEN_VERSION \
(MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \
MIOPEN_VERSION_PATCH) // NOLINT
// MIOPEN only support NCHW, just for compatibility with CUDNN API
typedef enum {
MIOPEN_TENSOR_NCHW = 0,
MIOPEN_TENSOR_NHWC = 1,
} miopenTensorFormat_t;
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag miopen_dso_flag;
extern void* miopen_dso_handle;
extern bool HasCUDNN(); extern bool HasCUDNN();
inline const char* miopenGetErrorString(miopenStatus_t status) { #define PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \
switch (status) { using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
case miopenStatusSuccess: extern DynLoad__##__name __name
return "MIOPEN_STATUS_SUCCESS";
case miopenStatusNotInitialized:
return "MIOPEN_STATUS_NOT_INITIALIZED";
case miopenStatusInvalidValue:
return "MIOPEN_STATUS_INVALID_VALUE";
case miopenStatusBadParm:
return "MIOPEN_STATUS_BAD_PARAM";
case miopenStatusAllocFailed:
return "MIOPEN_STATUS_ALLOC_FAILED";
case miopenStatusInternalError:
return "MIOPEN_STATUS_INTERNAL_ERROR";
case miopenStatusNotImplemented:
return "MIOPEN_STATUS_NOT_IMPLEMENTED";
case miopenStatusUnsupportedOp:
return "MIOPEN_STATUS_UNSUPPORTED_OP";
case miopenStatusUnknownError:
default:
return "MIOPEN_STATUS_UNKNOWN_ERROR";
}
}
extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using miopen_func = decltype(&::__name); \
std::call_once(miopen_dso_flag, []() { \
miopen_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(miopen_dso_handle, #__name); \
return reinterpret_cast<miopen_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/** /**
* include all needed miopen functions in HPPL * include all needed miopen functions in HPPL
...@@ -145,23 +96,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -145,23 +96,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(miopenRNNForwardInference); \ __macro(miopenRNNForwardInference); \
__macro(miopenGetTensorNumBytes); __macro(miopenGetTensorNumBytes);
MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) MIOPEN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \ #define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \
__macro(miopenConvolutionBackwardData); __macro(miopenConvolutionBackwardData);
MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) MIOPEN_DNN_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R3: // APIs available after R3:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
__macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize); __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R4: // APIs available after R4:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
__macro(miopenBatchNormalizationForwardTraining); \ __macro(miopenBatchNormalizationForwardTraining); \
__macro(miopenBatchNormalizationForwardInference); \ __macro(miopenBatchNormalizationForwardInference); \
__macro(miopenBatchNormalizationBackward); __macro(miopenBatchNormalizationBackward);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R5 // APIs in R5
#define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \ #define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \
...@@ -169,12 +120,12 @@ MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) ...@@ -169,12 +120,12 @@ MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
__macro(miopenSetActivationDescriptor); \ __macro(miopenSetActivationDescriptor); \
__macro(miopenGetActivationDescriptor); \ __macro(miopenGetActivationDescriptor); \
__macro(miopenDestroyActivationDescriptor); __macro(miopenDestroyActivationDescriptor);
MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) MIOPEN_DNN_ROUTINE_EACH_R5(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R6 // APIs in R6
#define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \ #define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \
/*__macro(miopenSetRNNDescriptor_v6);*/ /*__macro(miopenSetRNNDescriptor_v6);*/
MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) MIOPEN_DNN_ROUTINE_EACH_R6(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \ #define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(miopenSetConvolutionGroupCount); \ __macro(miopenSetConvolutionGroupCount); \
...@@ -184,7 +135,7 @@ MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) ...@@ -184,7 +135,7 @@ MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
__macro(miopenSetCTCLossDescriptor); \ __macro(miopenSetCTCLossDescriptor); \
__macro(miopenGetCTCLossWorkspaceSize); \ __macro(miopenGetCTCLossWorkspaceSize); \
__macro(miopenCTCLoss); __macro(miopenCTCLoss);
MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) MIOPEN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
/*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ /*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
...@@ -192,7 +143,7 @@ __macro(cudnnBatchNormalizationForwardTrainingEx); \ ...@@ -192,7 +143,7 @@ __macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \ __macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/ __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -18,9 +18,6 @@ namespace paddle { ...@@ -18,9 +18,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag mklml_dso_flag;
void* mklml_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
MKLML_ROUTINE_EACH(DEFINE_WRAP); MKLML_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -17,36 +17,23 @@ limitations under the License. */ ...@@ -17,36 +17,23 @@ limitations under the License. */
#include <mkl.h> #include <mkl.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/mklml.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag mklml_dso_flag;
extern void *mklml_dso_handle;
/** /**
* The following macro definition can generate structs * The following macro definition can generate structs
* (for each function) to dynamic load mklml routine * (for each function) to dynamic load mklml routine
* via operator overloading. * via operator overloading.
*/ */
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \ #define DYNAMIC_LOAD_MKLML_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklmlFunc = decltype(&::__name); \
std::call_once(mklml_dso_flag, []() { \
mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
}); \
static void *p_##_name = dlsym(mklml_dso_handle, #__name); \
return reinterpret_cast<mklmlFunc>(p_##_name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name) #define PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) \
DYNAMIC_LOAD_MKLML_WRAP(__name)
#define MKLML_ROUTINE_EACH(__macro) \ #define MKLML_ROUTINE_EACH(__macro) \
__macro(cblas_sgemm); \ __macro(cblas_sgemm); \
...@@ -111,7 +98,7 @@ extern void *mklml_dso_handle; ...@@ -111,7 +98,7 @@ extern void *mklml_dso_handle;
__macro(MKL_Set_Num_Threads); \ __macro(MKL_Set_Num_Threads); \
__macro(MKL_Get_Max_Threads); __macro(MKL_Get_Max_Threads);
MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); MKLML_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
#if !defined(_WIN32) #if !defined(_WIN32)
DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm); DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -32,18 +32,8 @@ extern void* mklrt_dso_handle; ...@@ -32,18 +32,8 @@ extern void* mklrt_dso_handle;
* (for each function) to dynamic load mkldfti routine * (for each function) to dynamic load mkldfti routine
* via operator overloading. * via operator overloading.
*/ */
#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \ #define DYNAMIC_LOAD_MKLRT_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklrtFunc = decltype(&::__name); \
std::call_once(mklrt_dso_flag, []() { \
mklrt_dso_handle = paddle::platform::dynload::GetMKLRTDsoHandle(); \
}); \
static void* p_##__name = dlsym(mklrt_dso_handle, #__name); \
return reinterpret_cast<mklrtFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
// mkl_dfti.h has a macro that shadows the function with the same name // mkl_dfti.h has a macro that shadows the function with the same name
......
...@@ -18,9 +18,6 @@ namespace paddle { ...@@ -18,9 +18,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag nccl_dso_flag;
void *nccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -16,28 +16,14 @@ limitations under the License. */ ...@@ -16,28 +16,14 @@ limitations under the License. */
#include <nccl.h> #include <nccl.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/nccl.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag nccl_dso_flag; #define PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
extern void* nccl_dso_handle; using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(nccl_dso_flag, []() { \
nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define NCCL_RAND_ROUTINE_EACH(__macro) \ #define NCCL_RAND_ROUTINE_EACH(__macro) \
...@@ -57,30 +43,30 @@ extern void* nccl_dso_handle; ...@@ -57,30 +43,30 @@ extern void* nccl_dso_handle;
__macro(ncclReduceScatter); \ __macro(ncclReduceScatter); \
__macro(ncclGetErrorString); __macro(ncclGetErrorString);
NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) NCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212 #if NCCL_VERSION_CODE >= 2212
#define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast); #define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) NCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif #endif
#if NCCL_VERSION_CODE >= 2304 #if NCCL_VERSION_CODE >= 2304
#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); #define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion);
NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) NCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif #endif
#if NCCL_VERSION_CODE >= 2703 #if NCCL_VERSION_CODE >= 2703
#define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ #define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \ __macro(ncclSend); \
__macro(ncclRecv); __macro(ncclRecv);
NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) NCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif #endif
#if NCCL_VERSION_CODE >= 21100 #if NCCL_VERSION_CODE >= 21100
#define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ #define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \
__macro(ncclRedOpCreatePreMulSum); \ __macro(ncclRedOpCreatePreMulSum); \
__macro(ncclRedOpDestroy); __macro(ncclRedOpDestroy);
NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) NCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif #endif
} // namespace dynload } // namespace dynload
......
...@@ -15,9 +15,6 @@ namespace paddle { ...@@ -15,9 +15,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag nvjpeg_dso_flag;
void *nvjpeg_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP); NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -14,27 +14,14 @@ limitations under the License. */ ...@@ -14,27 +14,14 @@ limitations under the License. */
#include <nvjpeg.h> #include <nvjpeg.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/nvjpeg.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag nvjpeg_dso_flag;
extern void *nvjpeg_dso_handle; #define PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
nvjpegStatus_t operator()(Args... args) { \
using nvjpegFunc = decltype(&::__name); \
std::call_once(nvjpeg_dso_flag, []() { \
nvjpeg_dso_handle = paddle::platform::dynload::GetNvjpegDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \
return reinterpret_cast<nvjpegFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define NVJPEG_RAND_ROUTINE_EACH(__macro) \ #define NVJPEG_RAND_ROUTINE_EACH(__macro) \
...@@ -44,7 +31,7 @@ extern void *nvjpeg_dso_handle; ...@@ -44,7 +31,7 @@ extern void *nvjpeg_dso_handle;
__macro(nvjpegJpegStateDestroy); \ __macro(nvjpegJpegStateDestroy); \
__macro(nvjpegDecode); __macro(nvjpegDecode);
NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP); NVJPEG_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and ...@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/nvrtc.h" #include "paddle/fluid/platform/dynload/nvrtc.h"
#include "paddle/pten/backends/dynload/nvrtc.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag nvrtc_dso_flag;
void* nvrtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
NVRTC_ROUTINE_EACH(DEFINE_WRAP); NVRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() { bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
std::call_once(nvrtc_dso_flag,
[]() { nvrtc_dso_handle = GetNVRTCDsoHandle(); });
return nvrtc_dso_handle != nullptr;
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -17,30 +17,17 @@ limitations under the License. */ ...@@ -17,30 +17,17 @@ limitations under the License. */
#include <nvrtc.h> #include <nvrtc.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/nvrtc.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag nvrtc_dso_flag;
extern void* nvrtc_dso_handle;
extern bool HasNVRTC(); extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \ extern DynLoad__##__name __name
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using nvrtc_func = decltype(&::__name); \
std::call_once(nvrtc_dso_flag, []() { \
nvrtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(nvrtc_dso_handle, #__name); \
return reinterpret_cast<nvrtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/** /**
* include all needed nvrtc functions * include all needed nvrtc functions
...@@ -56,9 +43,9 @@ extern bool HasNVRTC(); ...@@ -56,9 +43,9 @@ extern bool HasNVRTC();
__macro(nvrtcGetProgramLog); \ __macro(nvrtcGetProgramLog); \
__macro(nvrtcGetProgramLogSize) __macro(nvrtcGetProgramLogSize)
NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); NVRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -18,9 +18,6 @@ namespace paddle { ...@@ -18,9 +18,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag nvtx_dso_flag;
void *nvtx_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
NVTX_ROUTINE_EACH(DEFINE_WRAP); NVTX_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -17,36 +17,23 @@ limitations under the License. */ ...@@ -17,36 +17,23 @@ limitations under the License. */
#include <nvToolsExt.h> #include <nvToolsExt.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/nvtx.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag nvtx_dso_flag;
extern void *nvtx_dso_handle; #define PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
int operator()(Args... args) { \
using nvtxFunc = decltype(&::__name); \
std::call_once(nvtx_dso_flag, []() { \
nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \
return reinterpret_cast<nvtxFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define NVTX_ROUTINE_EACH(__macro) \ #define NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \ __macro(nvtxRangePushA); \
__macro(nvtxRangePop); __macro(nvtxRangePop);
NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP); NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
......
...@@ -18,9 +18,6 @@ namespace paddle { ...@@ -18,9 +18,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag rccl_dso_flag;
void *rccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -16,28 +16,14 @@ limitations under the License. */ ...@@ -16,28 +16,14 @@ limitations under the License. */
#include <rccl.h> #include <rccl.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/rccl.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag rccl_dso_flag; #define PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \
extern void* rccl_dso_handle; using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(rccl_dso_flag, []() { \
rccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(rccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define RCCL_RAND_ROUTINE_EACH(__macro) \ #define RCCL_RAND_ROUTINE_EACH(__macro) \
...@@ -57,18 +43,18 @@ extern void* rccl_dso_handle; ...@@ -57,18 +43,18 @@ extern void* rccl_dso_handle;
__macro(ncclReduceScatter); \ __macro(ncclReduceScatter); \
__macro(ncclGetErrorString); __macro(ncclGetErrorString);
RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) RCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212 #if NCCL_VERSION_CODE >= 2212
#define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast); #define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) RCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif #endif
#if NCCL_VERSION_CODE >= 2703 #if NCCL_VERSION_CODE >= 2703
#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ #define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \ __macro(ncclSend); \
__macro(ncclRecv); __macro(ncclRecv);
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) RCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif #endif
} // namespace dynload } // namespace dynload
......
...@@ -17,8 +17,6 @@ limitations under the License. */ ...@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag rocblas_dso_flag;
void *rocblas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
......
...@@ -19,16 +19,12 @@ limitations under the License. */ ...@@ -19,16 +19,12 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <type_traits> #include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/rocblas.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag rocblas_dso_flag;
extern void *rocblas_dso_handle;
/** /**
* The following macro definition can generate structs * The following macro definition can generate structs
* (for each function) to dynamic load cublas routine * (for each function) to dynamic load cublas routine
...@@ -36,18 +32,8 @@ extern void *rocblas_dso_handle; ...@@ -36,18 +32,8 @@ extern void *rocblas_dso_handle;
* *
* note: default dynamic linked libs * note: default dynamic linked libs
*/ */
#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
rocblas_status operator()(Args... args) { \
using rocblas_func = decltype(&::__name); \
std::call_once(rocblas_dso_flag, []() { \
rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \
return reinterpret_cast<rocblas_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \ #define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
...@@ -83,7 +69,7 @@ extern void *rocblas_dso_handle; ...@@ -83,7 +69,7 @@ extern void *rocblas_dso_handle;
__macro(rocblas_set_pointer_mode); \ __macro(rocblas_set_pointer_mode); \
__macro(rocblas_get_pointer_mode); __macro(rocblas_get_pointer_mode);
ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) ROCBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// APIs available after CUDA 8.0 // APIs available after CUDA 8.0
#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ #define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
...@@ -94,21 +80,21 @@ ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) ...@@ -94,21 +80,21 @@ ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
__macro(rocblas_zgemm_strided_batched); \ __macro(rocblas_zgemm_strided_batched); \
__macro(rocblas_hgemm_strided_batched); __macro(rocblas_hgemm_strided_batched);
ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) ROCBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// HIP not supported in ROCM3.5 // HIP not supported in ROCM3.5
// #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro) // #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
// __macro(cublasSetMathMode); // __macro(cublasSetMathMode);
// __macro(cublasGetMathMode); // __macro(cublasGetMathMode);
// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) // ROCBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ #define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(rocblas_gemm_batched_ex); \ __macro(rocblas_gemm_batched_ex); \
__macro(rocblas_gemm_strided_batched_ex); __macro(rocblas_gemm_strided_batched_ex);
ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP) ROCBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -13,22 +13,17 @@ See the License for the specific language governing permissions and ...@@ -13,22 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/rocm_driver.h" #include "paddle/fluid/platform/dynload/rocm_driver.h"
#include "paddle/pten/backends/dynload/rocm_driver.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag rocm_dso_flag;
void* rocm_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
ROCM_ROUTINE_EACH(DEFINE_WRAP); ROCM_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() { bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); });
return rocm_dso_handle != nullptr;
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -17,30 +17,17 @@ limitations under the License. */ ...@@ -17,30 +17,17 @@ limitations under the License. */
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/rocm_driver.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag rocm_dso_flag;
extern void* rocm_dso_handle;
extern bool HasCUDADriver(); extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \ #define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \ extern DynLoad__##__name __name
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using rocm_func = decltype(&::__name); \
std::call_once(rocm_dso_flag, []() { \
rocm_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(rocm_dso_handle, #__name); \
return reinterpret_cast<rocm_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/** /**
* include all needed cuda driver functions * include all needed cuda driver functions
...@@ -59,9 +46,9 @@ extern bool HasCUDADriver(); ...@@ -59,9 +46,9 @@ extern bool HasCUDADriver();
__macro(hipGetDeviceCount); \ __macro(hipGetDeviceCount); \
__macro(hipDevicePrimaryCtxGetState) __macro(hipDevicePrimaryCtxGetState)
ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); ROCM_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -18,9 +18,6 @@ namespace paddle { ...@@ -18,9 +18,6 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag warpctc_dso_flag;
void* warpctc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
WARPCTC_ROUTINE_EACH(DEFINE_WRAP); WARPCTC_ROUTINE_EACH(DEFINE_WRAP);
......
...@@ -16,34 +16,19 @@ limitations under the License. */ ...@@ -16,34 +16,19 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/pten/backends/dynload/warpctc.h"
#include "paddle/fluid/platform/port.h"
#include "warpctc/include/ctc.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
extern std::once_flag warpctc_dso_flag;
extern void* warpctc_dso_handle;
/** /**
* The following macro definition can generate structs * The following macro definition can generate structs
* (for each function) to dynamic load warpctc routine * (for each function) to dynamic load warpctc routine
* via operator overloading. * via operator overloading.
*/ */
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ #define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
struct DynLoad__##__name { \ using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using warpctcFunc = decltype(&::__name); \
std::call_once(warpctc_dso_flag, []() { \
warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
}); \
static void* p_##_name = dlsym(warpctc_dso_handle, #__name); \
return reinterpret_cast<warpctcFunc>(p_##_name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
......
...@@ -65,30 +65,30 @@ limitations under the License. */ ...@@ -65,30 +65,30 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/variant.h" #include "paddle/fluid/platform/variant.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/to_string.h" #include "paddle/fluid/string/to_string.h"
#include "paddle/pten/backends/dynload/port.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/curand.h" #include "paddle/pten/backends/dynload/curand.h"
#include "paddle/fluid/platform/dynload/cusolver.h" #include "paddle/pten/backends/dynload/cusolver.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include <error.h> #include <error.h>
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/pten/backends/dynload/nccl.h"
#endif // __APPLE__ #endif // __APPLE__
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hipfft.h" #include "paddle/pten/backends/dynload/hipfft.h"
#include "paddle/fluid/platform/dynload/hiprand.h" #include "paddle/pten/backends/dynload/hiprand.h"
#include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h" #include "paddle/pten/backends/dynload/rocblas.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include <error.h> // NOLINT #include <error.h> // NOLINT
#include "paddle/fluid/platform/dynload/rccl.h" #include "paddle/pten/backends/dynload/rccl.h"
#endif // __APPLE__ #endif // __APPLE__
#endif // PADDLE_WITH_HIP #endif // PADDLE_WITH_HIP
...@@ -880,7 +880,7 @@ inline bool is_error(cudnnStatus_t stat) { ...@@ -880,7 +880,7 @@ inline bool is_error(cudnnStatus_t stat) {
inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { inline std::string build_nvidia_error_msg(cudnnStatus_t stat) {
std::ostringstream sout; std::ostringstream sout;
sout << "CUDNN error(" << stat << "), " sout << "CUDNN error(" << stat << "), "
<< platform::dynload::cudnnGetErrorString(stat) << ". " << pten::dynload::cudnnGetErrorString(stat) << ". "
<< GetExternalErrorMsg(stat); << GetExternalErrorMsg(stat);
return sout.str(); return sout.str();
} }
...@@ -945,7 +945,7 @@ inline bool is_error(ncclResult_t nccl_result) { ...@@ -945,7 +945,7 @@ inline bool is_error(ncclResult_t nccl_result) {
inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) {
std::ostringstream sout; std::ostringstream sout;
sout << "NCCL error(" << nccl_result << "), " sout << "NCCL error(" << nccl_result << "), "
<< platform::dynload::ncclGetErrorString(nccl_result) << ". "; << pten::dynload::ncclGetErrorString(nccl_result) << ". ";
if (errno == ENOSPC || errno == EAGAIN) { if (errno == ENOSPC || errno == EAGAIN) {
std::string detail(strerror(errno)); std::string detail(strerror(errno));
detail += "\nPlease try one of the following solutions:"; detail += "\nPlease try one of the following solutions:";
...@@ -1090,7 +1090,7 @@ inline bool is_error(miopenStatus_t stat) { ...@@ -1090,7 +1090,7 @@ inline bool is_error(miopenStatus_t stat) {
inline std::string build_rocm_error_msg(miopenStatus_t stat) { inline std::string build_rocm_error_msg(miopenStatus_t stat) {
std::string msg(" Miopen error, "); std::string msg(" Miopen error, ");
return msg + platform::dynload::miopenGetErrorString(stat) + " "; return msg + pten::dynload::miopenGetErrorString(stat) + " ";
} }
/***** ROCBLAS ERROR *****/ /***** ROCBLAS ERROR *****/
...@@ -1132,7 +1132,7 @@ inline bool is_error(ncclResult_t nccl_result) { ...@@ -1132,7 +1132,7 @@ inline bool is_error(ncclResult_t nccl_result) {
inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { inline std::string build_rocm_error_msg(ncclResult_t nccl_result) {
std::string msg(" Rccl error, "); std::string msg(" Rccl error, ");
return msg + platform::dynload::ncclGetErrorString(nccl_result) + " "; return msg + pten::dynload::ncclGetErrorString(nccl_result) + " ";
} }
#endif // not(__APPLE__) and PADDLE_WITH_NCCL #endif // not(__APPLE__) and PADDLE_WITH_NCCL
...@@ -1141,7 +1141,7 @@ inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; } ...@@ -1141,7 +1141,7 @@ inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; }
inline std::string build_rocm_error_msg(hipfftResult_t stat) { inline std::string build_rocm_error_msg(hipfftResult_t stat) {
std::string msg(" HIPFFT error, "); std::string msg(" HIPFFT error, ");
return msg + platform::dynload::hipfftGetErrorString(stat) + " "; return msg + pten::dynload::hipfftGetErrorString(stat) + " ";
} }
namespace details { namespace details {
......
...@@ -24,6 +24,8 @@ limitations under the License. */ ...@@ -24,6 +24,8 @@ limitations under the License. */
#include <unistd.h> #include <unistd.h>
#elif defined(_MSC_VER) #elif defined(_MSC_VER)
#include <processthreadsapi.h> #include <processthreadsapi.h>
#else
#include <unistd.h>
#endif #endif
#include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#ifdef _POSIX_C_SOURCE #ifdef _POSIX_C_SOURCE
#include <time.h> #include <time.h>
#endif #endif
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <stdlib.h> #include <stdlib.h>
#include "paddle/fluid/platform/port.h" #include "paddle/pten/backends/dynload/port.h"
#ifdef _WIN32 #ifdef _WIN32
static unsigned sleep(unsigned seconds) { static unsigned sleep(unsigned seconds) {
......
add_subdirectory(dynload)
add_subdirectory(cpu) add_subdirectory(cpu)
cc_library(pten_context SRCS all_context.cc DEPS device_context) cc_library(pten_context SRCS all_context.cc DEPS device_context)
# Build rules for the pten dynload wrappers (moved from fluid/platform/dynload).
# Core loader: locates and dlopen()s vendor shared libraries at runtime.
cc_library(pten_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags)
# CUDA-toolkit libraries that are resolved lazily through the loader.
list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
# nvjpeg is not available in the NV Jetson toolchain.
if (NOT WITH_NV_JETSON)
list(APPEND CUDA_SRCS nvjpeg.cc)
endif()
if (WITH_ROCM)
list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc)
endif()
# There is no macOS version of NCCL.
# Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows.
if (NOT APPLE)
list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
if (WITH_NCCL)
list(APPEND CUDA_SRCS nccl.cc)
endif()
if (WITH_ROCM)
list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc)
if (WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc)
endif()
endif()
endif()
if (TENSORRT_FOUND)
list(APPEND CUDA_SRCS tensorrt.cc)
endif()
# Bakes the CUPTI library search path discovered at configure time into a header.
configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
# GPU wrapper library: HIP sources on ROCm builds, CUDA sources otherwise;
# warpctc is built in every configuration.
if(WITH_ROCM)
hip_library(pten_dynload_cuda SRCS ${HIP_SRCS} DEPS pten_dynamic_loader)
cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc)
elseif (WITH_ASCEND_CL)
cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc npu_hccl)
else()
nv_library(pten_dynload_cuda SRCS ${CUDA_SRCS} DEPS pten_dynamic_loader)
cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc)
endif()
if (WITH_MKLML)
cc_library(pten_dynload_mklml SRCS mklml.cc DEPS pten_dynamic_loader mklml)
endif()
cc_library(pten_dynload_lapack SRCS lapack.cc DEPS pten_dynamic_loader)
add_dependencies(pten_dynload_lapack extern_lapack)
# TODO(TJ): add iomp, mkldnn?
if (MKL_FOUND AND WITH_ONEMKL)
message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
cc_library(pten_dynload_mklrt SRCS mklrt.cc DEPS pten_dynamic_loader)
target_include_directories(pten_dynload_mklrt PRIVATE ${MKL_INCLUDE})
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cublas.h"

namespace pten {
namespace dynload {

// Guard and handle for lazily dlopen()-ing the cuBLAS shared library.
// The handle is filled in exactly once by the first wrapped cuBLAS call.
std::once_flag cublas_dso_flag;
void *cublas_dso_handle = nullptr;

// Instantiates the DynLoad__<name> functor objects declared in cublas.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);

// The R2/R3/R4 groups exist only when cublas.h defined them for the
// CUDA toolkit version this build targets.
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2
CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif

#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3
CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif

#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4
CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
#endif

}  // namespace dynload
}  // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <cublasXt.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <mutex>  // NOLINT
#include <type_traits>

#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"

namespace pten {
namespace dynload {

// Shared once-flag and dlopen() handle for the cuBLAS library; defined in
// cublas.cc and used by every generated wrapper below.
extern std::once_flag cublas_dso_flag;
extern void *cublas_dso_handle;

/**
 * The following macro definition can generate structs
 * (for each function) to dynamic load cublas routine
 * via operator overloading.
 *
 * note: default dynamic linked libs
 */
// Each expansion declares a functor struct whose operator() loads the
// library once (std::call_once), resolves the symbol with dlsym, caches the
// pointer in a function-local static, and forwards the call.
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                             \
  struct DynLoad__##__name {                                                 \
    template <typename... Args>                                              \
    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {  \
      using cublas_func =                                                    \
          decltype(::__name(std::declval<Args>()...)) (*)(Args...);          \
      std::call_once(cublas_dso_flag, []() {                                 \
        cublas_dso_handle = pten::dynload::GetCublasDsoHandle();             \
      });                                                                    \
      static void *p_##__name = dlsym(cublas_dso_handle, #__name);           \
      return reinterpret_cast<cublas_func>(p_##__name)(args...);             \
    }                                                                        \
  };                                                                         \
  extern DynLoad__##__name __name

// Baseline cuBLAS routines wrapped for every supported CUDA version.
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
  __macro(cublasSaxpy_v2);                \
  __macro(cublasDaxpy_v2);                \
  __macro(cublasCaxpy_v2);                \
  __macro(cublasZaxpy_v2);                \
  __macro(cublasSscal_v2);                \
  __macro(cublasDscal_v2);                \
  __macro(cublasScopy_v2);                \
  __macro(cublasDcopy_v2);                \
  __macro(cublasSgemv_v2);                \
  __macro(cublasDgemv_v2);                \
  __macro(cublasCgemv_v2);                \
  __macro(cublasZgemv_v2);                \
  __macro(cublasSgemm_v2);                \
  __macro(cublasDgemm_v2);                \
  __macro(cublasCgemm_v2);                \
  __macro(cublasZgemm_v2);                \
  __macro(cublasHgemm);                   \
  __macro(cublasSgemmEx);                 \
  __macro(cublasSgeam);                   \
  __macro(cublasDgeam);                   \
  __macro(cublasStrsm_v2);                \
  __macro(cublasDtrsm_v2);                \
  __macro(cublasCtrsm_v2);                \
  __macro(cublasZtrsm_v2);                \
  __macro(cublasCreate_v2);               \
  __macro(cublasDestroy_v2);              \
  __macro(cublasSetStream_v2);            \
  __macro(cublasSetPointerMode_v2);       \
  __macro(cublasGetPointerMode_v2);       \
  __macro(cublasSgemmBatched);            \
  __macro(cublasDgemmBatched);            \
  __macro(cublasCgemmBatched);            \
  __macro(cublasZgemmBatched);            \
  __macro(cublasStrsmBatched);            \
  __macro(cublasDtrsmBatched);            \
  __macro(cublasCtrsmBatched);            \
  __macro(cublasZtrsmBatched);            \
  __macro(cublasSgetrfBatched);           \
  __macro(cublasSgetriBatched);           \
  __macro(cublasDgetrfBatched);           \
  __macro(cublasDgetriBatched);           \
  __macro(cublasSmatinvBatched);          \
  __macro(cublasDmatinvBatched);          \
  __macro(cublasSgetrsBatched);           \
  __macro(cublasDgetrsBatched);

CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)

// APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
  __macro(cublasGemmEx);                     \
  __macro(cublasSgemmStridedBatched);        \
  __macro(cublasDgemmStridedBatched);        \
  __macro(cublasCgemmStridedBatched);        \
  __macro(cublasZgemmStridedBatched);        \
  __macro(cublasHgemmStridedBatched);

CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif

// APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000
#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \
  __macro(cublasSetMathMode);                \
  __macro(cublasGetMathMode);

CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif

// APIs available after CUDA 9.1
#if CUDA_VERSION >= 9010
#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
  __macro(cublasGemmBatchedEx);              \
  __macro(cublasGemmStridedBatchedEx);

CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif

#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP

}  // namespace dynload
}  // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cublasLt.h"

namespace pten {
namespace dynload {

// Guard and handle for lazily dlopen()-ing the cublasLt shared library.
std::once_flag cublasLt_dso_flag;
void *cublasLt_dso_handle = nullptr;

// Instantiates the DynLoad__<name> functor objects declared in cublasLt.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

CUBLASLT_BLAS_ROUTINE_EACH(DEFINE_WRAP);

}  // namespace dynload
}  // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <cublasLt.h>
#include <cuda.h>
#include <mutex>  // NOLINT
#include <type_traits>

#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"

namespace pten {
namespace dynload {

// Shared once-flag and dlopen() handle for cublasLt; defined in cublasLt.cc.
extern std::once_flag cublasLt_dso_flag;
extern void *cublasLt_dso_handle;

/**
 * The following macro definition can generate structs
 * (for each function) to dynamic load cublasLt routine
 * via operator overloading.
 *
 * note: default dynamic linked libs
 */
// Each expansion declares a functor whose operator() opens the library once,
// resolves the symbol via dlsym, caches it in a local static, and forwards.
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name)                           \
  struct DynLoad__##__name {                                                 \
    template <typename... Args>                                              \
    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {  \
      using cublasLt_func =                                                  \
          decltype(::__name(std::declval<Args>()...)) (*)(Args...);          \
      std::call_once(cublasLt_dso_flag, []() {                               \
        cublasLt_dso_handle = pten::dynload::GetCublasLtDsoHandle();         \
      });                                                                    \
      static void *p_##__name = dlsym(cublasLt_dso_handle, #__name);         \
      return reinterpret_cast<cublasLt_func>(p_##__name)(args...);           \
    }                                                                        \
  };                                                                         \
  extern DynLoad__##__name __name

// APIs available after CUDA 10.1
// #if CUDA_VERSION >= 10100
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro)      \
  __macro(cublasLtCreate);                       \
  __macro(cublasLtDestroy);                      \
  __macro(cublasLtMatmul);                       \
  __macro(cublasLtMatmulDescCreate);             \
  __macro(cublasLtMatmulDescDestroy);            \
  __macro(cublasLtMatmulDescSetAttribute);       \
  __macro(cublasLtMatrixLayoutCreate);           \
  __macro(cublasLtMatrixLayoutDestroy);          \
  __macro(cublasLtMatrixLayoutSetAttribute);     \
  __macro(cublasLtMatrixTransform);              \
  __macro(cublasLtMatrixTransformDescCreate);    \
  __macro(cublasLtMatrixTransformDescDestroy);   \
  __macro(cublasLtMatrixTransformDescSetAttribute);

CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif

#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
}  // namespace dynload
}  // namespace pten
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cuda_driver.h"

namespace pten {
namespace dynload {

// Guard and handle for lazily dlopen()-ing the CUDA driver library (libcuda).
std::once_flag cuda_dso_flag;
void* cuda_dso_handle = nullptr;

// Instantiates the DynLoad__<name> functor objects declared in cuda_driver.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

// Virtual-memory-management entry points only exist on CUDA >= 10.2.
#if CUDA_VERSION >= 10020
CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP);
#endif
CUDA_ROUTINE_EACH(DEFINE_WRAP);
// Reports whether the CUDA driver shared library could be located and
// opened. The dlopen() attempt is made at most once per process; later
// calls only re-check the cached handle.
bool HasCUDADriver() {
  std::call_once(cuda_dso_flag, [] {
    cuda_dso_handle = GetCUDADsoHandle();
  });
  return nullptr != cuda_dso_handle;
}

}  // namespace dynload
}  // namespace pten
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <cuda.h>
#include <mutex>  // NOLINT

#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"

namespace pten {
namespace dynload {

// Shared once-flag and dlopen() handle for libcuda; defined in cuda_driver.cc.
extern std::once_flag cuda_dso_flag;
extern void* cuda_dso_handle;

// True when the CUDA driver library could be opened (see cuda_driver.cc).
extern bool HasCUDADriver();

// Generates a functor struct per driver-API symbol: the library is opened
// once, the symbol resolved with dlsym and cached in a local static, and the
// call forwarded with the original signature.
#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name)                       \
  struct DynLoad__##__name {                                         \
    template <typename... Args>                                      \
    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
      using cuda_func = decltype(&::__name);                         \
      std::call_once(cuda_dso_flag, []() {                           \
        cuda_dso_handle = pten::dynload::GetCUDADsoHandle();         \
      });                                                            \
      static void* p_##__name = dlsym(cuda_dso_handle, #__name);     \
      return reinterpret_cast<cuda_func>(p_##__name)(args...);       \
    }                                                                \
  };                                                                 \
  extern struct DynLoad__##__name __name

/**
 * include all needed cuda driver functions
 **/
#define CUDA_ROUTINE_EACH(__macro)                      \
  __macro(cuInit);                                      \
  __macro(cuDriverGetVersion);                          \
  __macro(cuGetErrorString);                            \
  __macro(cuModuleLoadData);                            \
  __macro(cuModuleGetFunction);                         \
  __macro(cuModuleUnload);                              \
  __macro(cuOccupancyMaxActiveBlocksPerMultiprocessor); \
  __macro(cuLaunchKernel);                              \
  __macro(cuCtxCreate);                                 \
  __macro(cuCtxGetCurrent);                             \
  __macro(cuDeviceGetCount);                            \
  __macro(cuDevicePrimaryCtxGetState);                  \
  __macro(cuDeviceGetAttribute);                        \
  __macro(cuDeviceGet)

// Virtual-memory-management driver APIs, introduced in CUDA 10.2.
#if CUDA_VERSION >= 10020
#define CUDA_ROUTINE_EACH_VVM(__macro)   \
  __macro(cuMemGetAllocationGranularity); \
  __macro(cuMemAddressReserve);           \
  __macro(cuMemCreate);                   \
  __macro(cuMemMap);                      \
  __macro(cuMemSetAccess);                \
  __macro(cuMemUnmap);                    \
  __macro(cuMemRelease);                  \
  __macro(cuMemAddressFree)

CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#endif

CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);

#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP

}  // namespace dynload
}  // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"

namespace pten {
namespace dynload {

// Guard and handle for lazily dlopen()-ing the cuDNN shared library.
std::once_flag cudnn_dso_flag;
void* cudnn_dso_handle = nullptr;

// Instantiates the DynLoad__<name> functor objects declared in cudnn.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP);

// Each version-gated group is expanded only when cudnn.h defined it for the
// cuDNN release this build targets.
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DEFINE_WRAP);
#endif

#ifdef CUDNN_DNN_ROUTINE_EACH_R7
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif

#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
#endif

#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif

#ifdef CUDNN_DNN_ROUTINE_EACH_R8
CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
#endif
// Returns true when the cuDNN shared library could be located and opened.
// Loading happens at most once per process; subsequent calls simply test
// the cached handle.
bool HasCUDNN() {
  std::call_once(cudnn_dso_flag, [] {
    cudnn_dso_handle = GetCUDNNDsoHandle();
  });
  return nullptr != cudnn_dso_handle;
}
// Aborts with a PreconditionNotMet error if the cuDNN library handle is
// still null, i.e. the library could not be loaded before invoking fn_name.
void EnforceCUDNNLoaded(const char* fn_name) {
  PADDLE_ENFORCE_NOT_NULL(
      cudnn_dso_handle,
      paddle::platform::errors::PreconditionNotMet(
          "Cannot load cudnn shared library. Cannot invoke method %s.",
          fn_name));
}

}  // namespace dynload
}  // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#include <mutex>  // NOLINT

#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"

namespace pten {
namespace dynload {

// Shared once-flag and dlopen() handle for cuDNN; defined in cudnn.cc.
extern std::once_flag cudnn_dso_flag;
extern void* cudnn_dso_handle;

// True when the cuDNN library could be opened (see cudnn.cc).
extern bool HasCUDNN();

// Raises a PreconditionNotMet error if the library failed to load.
extern void EnforceCUDNNLoaded(const char* fn_name);

// Generates a functor struct per cuDNN symbol: open the library once,
// verify it loaded, resolve the symbol with dlsym (cached in a local
// static), then forward the call with the original signature.
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                      \
  struct DynLoad__##__name {                                         \
    template <typename... Args>                                      \
    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
      using cudnn_func = decltype(&::__name);                        \
      std::call_once(cudnn_dso_flag, []() {                          \
        cudnn_dso_handle = pten::dynload::GetCUDNNDsoHandle();       \
      });                                                            \
      EnforceCUDNNLoaded(#__name);                                   \
      static void* p_##__name = dlsym(cudnn_dso_handle, #__name);    \
      return reinterpret_cast<cudnn_func>(p_##__name)(args...);      \
    }                                                                \
  };                                                                 \
  extern struct DynLoad__##__name __name

/**
 * include all needed cudnn functions in HPPL
 * different cudnn version has different interfaces
 **/
#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
  __macro(cudnnSetTensor4dDescriptor);                    \
  __macro(cudnnSetTensor4dDescriptorEx);                  \
  __macro(cudnnSetTensorNdDescriptor);                    \
  __macro(cudnnGetTensorNdDescriptor);                    \
  __macro(cudnnGetConvolutionNdForwardOutputDim);         \
  __macro(cudnnCreateTensorDescriptor);                   \
  __macro(cudnnDestroyTensorDescriptor);                  \
  __macro(cudnnCreateFilterDescriptor);                   \
  __macro(cudnnSetFilter4dDescriptor);                    \
  __macro(cudnnSetFilterNdDescriptor);                    \
  __macro(cudnnGetFilterNdDescriptor);                    \
  __macro(cudnnSetPooling2dDescriptor);                   \
  __macro(cudnnSetPoolingNdDescriptor);                   \
  __macro(cudnnGetPoolingNdDescriptor);                   \
  __macro(cudnnDestroyFilterDescriptor);                  \
  __macro(cudnnCreateConvolutionDescriptor);              \
  __macro(cudnnCreatePoolingDescriptor);                  \
  __macro(cudnnDestroyPoolingDescriptor);                 \
  __macro(cudnnSetConvolution2dDescriptor);               \
  __macro(cudnnDestroyConvolutionDescriptor);             \
  __macro(cudnnSetConvolutionNdDescriptor);               \
  __macro(cudnnGetConvolutionNdDescriptor);               \
  __macro(cudnnDeriveBNTensorDescriptor);                 \
  __macro(cudnnCreateSpatialTransformerDescriptor);       \
  __macro(cudnnSetSpatialTransformerNdDescriptor);        \
  __macro(cudnnDestroySpatialTransformerDescriptor);      \
  __macro(cudnnSpatialTfGridGeneratorForward);            \
  __macro(cudnnSpatialTfGridGeneratorBackward);           \
  __macro(cudnnSpatialTfSamplerForward);                  \
  __macro(cudnnSpatialTfSamplerBackward);                 \
  __macro(cudnnCreate);                                   \
  __macro(cudnnDestroy);                                  \
  __macro(cudnnSetStream);                                \
  __macro(cudnnActivationForward);                        \
  __macro(cudnnActivationBackward);                       \
  __macro(cudnnConvolutionForward);                       \
  __macro(cudnnConvolutionBackwardBias);                  \
  __macro(cudnnGetConvolutionForwardWorkspaceSize);       \
  __macro(cudnnTransformTensor);                          \
  __macro(cudnnPoolingForward);                           \
  __macro(cudnnPoolingBackward);                          \
  __macro(cudnnSoftmaxBackward);                          \
  __macro(cudnnSoftmaxForward);                           \
  __macro(cudnnGetVersion);                               \
  __macro(cudnnFindConvolutionForwardAlgorithmEx);        \
  __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
  __macro(cudnnFindConvolutionBackwardFilterAlgorithm);   \
  __macro(cudnnFindConvolutionBackwardDataAlgorithmEx);   \
  __macro(cudnnGetErrorString);                           \
  __macro(cudnnCreateDropoutDescriptor);                  \
  __macro(cudnnDropoutGetStatesSize);                     \
  __macro(cudnnSetDropoutDescriptor);                     \
  __macro(cudnnRestoreDropoutDescriptor);                 \
  __macro(cudnnCreateRNNDescriptor);                      \
  __macro(cudnnGetRNNParamsSize);                         \
  __macro(cudnnGetRNNWorkspaceSize);                      \
  __macro(cudnnGetRNNTrainingReserveSize);                \
  __macro(cudnnRNNForwardTraining);                       \
  __macro(cudnnRNNBackwardData);                          \
  __macro(cudnnRNNBackwardWeights);                       \
  __macro(cudnnRNNForwardInference);                      \
  __macro(cudnnDestroyDropoutDescriptor);                 \
  __macro(cudnnDestroyRNNDescriptor);                     \
  __macro(cudnnSetTensorNdDescriptorEx);                  \
  __macro(cudnnAddTensor);                                \
  __macro(cudnnConvolutionBackwardData);                  \
  __macro(cudnnConvolutionBackwardFilter);                \
  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize);  \
  __macro(cudnnBatchNormalizationForwardTraining);        \
  __macro(cudnnBatchNormalizationForwardInference);       \
  __macro(cudnnBatchNormalizationBackward);               \
  __macro(cudnnCreateActivationDescriptor);               \
  __macro(cudnnSetActivationDescriptor);                  \
  __macro(cudnnGetActivationDescriptor);                  \
  __macro(cudnnDestroyActivationDescriptor);              \
  __macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)

// APIs present in cuDNN 7.x but removed in cuDNN 8.
#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \
  __macro(cudnnGetConvolutionBackwardFilterAlgorithm);   \
  __macro(cudnnGetConvolutionForwardAlgorithm);          \
  __macro(cudnnGetConvolutionBackwardDataAlgorithm);     \
  __macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif

// APIs available since cuDNN 7.0.1.
#if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro)                \
  __macro(cudnnSetConvolutionGroupCount);                 \
  __macro(cudnnSetConvolutionMathType);                   \
  __macro(cudnnConvolutionBiasActivationForward);         \
  __macro(cudnnCreateCTCLossDescriptor);                  \
  __macro(cudnnDestroyCTCLossDescriptor);                 \
  __macro(cudnnGetCTCLossDescriptor);                     \
  __macro(cudnnSetCTCLossDescriptor);                     \
  __macro(cudnnGetCTCLossWorkspaceSize);                  \
  __macro(cudnnCTCLoss);                                  \
  __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7);   \
  __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
  __macro(cudnnGetConvolutionForwardAlgorithm_v7);        \
  __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif

// Variable-sequence-length RNN APIs available since cuDNN 7.2.1.
#if CUDNN_VERSION >= 7201
#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
  __macro(cudnnCreateRNNDataDescriptor);             \
  __macro(cudnnDestroyRNNDataDescriptor);            \
  __macro(cudnnSetRNNDataDescriptor);                \
  __macro(cudnnSetRNNPaddingMode);                   \
  __macro(cudnnRNNForwardTrainingEx);                \
  __macro(cudnnRNNBackwardDataEx);                   \
  __macro(cudnnRNNBackwardWeightsEx);                \
  __macro(cudnnRNNForwardInferenceEx);
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif

// Extended batch-norm APIs available since cuDNN 7.4.1.
#if CUDNN_VERSION >= 7401
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro)                    \
  __macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
  __macro(cudnnBatchNormalizationForwardTrainingEx);                \
  __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize);       \
  __macro(cudnnBatchNormalizationBackwardEx);                       \
  __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif

// Fused-ops and v8 RNN APIs available since cuDNN 8.0.
#if CUDNN_VERSION >= 8000
#define CUDNN_DNN_ROUTINE_EACH_R8(__macro)            \
  __macro(cudnnSetRNNDescriptor_v8);                  \
  __macro(cudnnCreateFusedOpsPlan);                   \
  __macro(cudnnCreateFusedOpsConstParamPack);         \
  __macro(cudnnCreateFusedOpsVariantParamPack);       \
  __macro(cudnnDestroyFusedOpsPlan);                  \
  __macro(cudnnDestroyFusedOpsConstParamPack);        \
  __macro(cudnnDestroyFusedOpsVariantParamPack);      \
  __macro(cudnnFusedOpsExecute);                      \
  __macro(cudnnSetFusedOpsConstParamPackAttribute);   \
  __macro(cudnnSetFusedOpsVariantParamPackAttribute); \
  __macro(cudnnMakeFusedOpsPlan);
CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif

}  // namespace dynload
}  // namespace pten

#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cufft.h"
#include "paddle/fluid/platform/enforce.h"

namespace pten {
namespace dynload {

// Guard and handle for lazily dlopen()-ing the cuFFT shared library.
std::once_flag cufft_dso_flag;
void* cufft_dso_handle = nullptr;

// Instantiates the DynLoad__<name> functor objects declared in cufft.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
// Returns true when the cuFFT shared library could be located and opened.
// The dlopen() attempt happens at most once; later calls reuse the handle.
bool HasCUFFT() {
  std::call_once(cufft_dso_flag, [] {
    cufft_dso_handle = GetCUFFTDsoHandle();
  });
  return nullptr != cufft_dso_handle;
}
// Aborts with a PreconditionNotMet error if the cuFFT library handle is
// still null, i.e. the library could not be loaded before invoking fn_name.
void EnforceCUFFTLoaded(const char* fn_name) {
  PADDLE_ENFORCE_NOT_NULL(
      cufft_dso_handle,
      paddle::platform::errors::PreconditionNotMet(
          "Cannot load cufft shared library. Cannot invoke method %s.",
          fn_name));
}

}  // namespace dynload
}  // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#ifdef PADDLE_WITH_CUDA
#include <cufft.h>
#include <cufftXt.h>
#include <glog/logging.h>
#include <mutex>  // NOLINT

#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"

namespace pten {
namespace dynload {

// Shared once-flag and dlopen() handle for cuFFT; defined in cufft.cc.
extern std::once_flag cufft_dso_flag;
extern void* cufft_dso_handle;

// True when the cuFFT library could be opened (see cufft.cc).
extern bool HasCUFFT();

// Raises a PreconditionNotMet error if the library failed to load.
extern void EnforceCUFFTLoaded(const char* fn_name);

// Generates a functor struct per cuFFT symbol: open the library once,
// verify it loaded, resolve the symbol with dlsym (cached in a local
// static), then forward the call with the original signature.
#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name)                      \
  struct DynLoad__##__name {                                         \
    template <typename... Args>                                      \
    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
      using cufft_func = decltype(&::__name);                        \
      std::call_once(cufft_dso_flag, []() {                          \
        cufft_dso_handle = pten::dynload::GetCUFFTDsoHandle();       \
      });                                                            \
      EnforceCUFFTLoaded(#__name);                                   \
      static void* p_##__name = dlsym(cufft_dso_handle, #__name);    \
      return reinterpret_cast<cufft_func>(p_##__name)(args...);      \
    }                                                                \
  };                                                                 \
  extern struct DynLoad__##__name __name

/**
 * include all needed cufft functions in HPPL
 * different cufft version has different interfaces
 **/
#define CUFFT_FFT_ROUTINE_EACH(__macro) \
  __macro(cufftPlan1d);                 \
  __macro(cufftPlan2d);                 \
  __macro(cufftPlan3d);                 \
  __macro(cufftPlanMany);               \
  __macro(cufftMakePlan1d);             \
  __macro(cufftMakePlan2d);             \
  __macro(cufftMakePlan3d);             \
  __macro(cufftMakePlanMany);           \
  __macro(cufftMakePlanMany64);         \
  __macro(cufftGetSizeMany64);          \
  __macro(cufftEstimate1d);             \
  __macro(cufftEstimate2d);             \
  __macro(cufftEstimate3d);             \
  __macro(cufftEstimateMany);           \
  __macro(cufftCreate);                 \
  __macro(cufftGetSize1d);              \
  __macro(cufftGetSize2d);              \
  __macro(cufftGetSize3d);              \
  __macro(cufftGetSizeMany);            \
  __macro(cufftGetSize);                \
  __macro(cufftSetWorkArea);            \
  __macro(cufftSetAutoAllocation);      \
  __macro(cufftExecC2C);                \
  __macro(cufftExecR2C);                \
  __macro(cufftExecC2R);                \
  __macro(cufftExecZ2Z);                \
  __macro(cufftExecD2Z);                \
  __macro(cufftExecZ2D);                \
  __macro(cufftSetStream);              \
  __macro(cufftDestroy);                \
  __macro(cufftGetVersion);             \
  __macro(cufftGetProperty);            \
  __macro(cufftXtSetGPUs);              \
  __macro(cufftXtMalloc);               \
  __macro(cufftXtMemcpy);               \
  __macro(cufftXtFree);                 \
  __macro(cufftXtSetWorkArea);          \
  __macro(cufftXtExecDescriptorC2C);    \
  __macro(cufftXtExecDescriptorR2C);    \
  __macro(cufftXtExecDescriptorC2R);    \
  __macro(cufftXtExecDescriptorZ2Z);    \
  __macro(cufftXtExecDescriptorD2Z);    \
  __macro(cufftXtExecDescriptorZ2D);    \
  __macro(cufftXtQueryPlan);            \
  __macro(cufftXtSetCallback);          \
  __macro(cufftXtClearCallback);        \
  __macro(cufftXtSetCallbackSharedSize); \
  __macro(cufftXtMakePlanMany);         \
  __macro(cufftXtGetSizeMany);          \
  __macro(cufftXtExec);                 \
  __macro(cufftXtExecDescriptor);       \
  __macro(cufftXtSetWorkAreaPolicy);
CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)

}  // namespace dynload
}  // namespace pten

#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUPTI
#include "paddle/pten/backends/dynload/cupti.h"
namespace pten {
namespace dynload {
// Guards the one-time dlopen of the CUPTI shared library.
std::once_flag cupti_dso_flag;
// Handle filled in lazily by the first wrapped CUPTI call (see cupti.h).
void *cupti_dso_handle = nullptr;
// Instantiates the DynLoad__<name> functor objects declared `extern` in
// cupti.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUPTI_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
#endif // PADDLE_WITH_CUPTI
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUPTI
#include <cuda.h>
#include <cupti.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cupti_dso_flag;
extern void *cupti_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cupti routine
* via operator overloading.
*
* note: default dynamic linked libs
*/
// Generates a functor struct DynLoad__<name>: the first invocation dlopen()s
// the CUPTI library (guarded by cupti_dso_flag), each instantiation resolves
// <name> once with dlsym, then forwards the arguments with CUPTI's calling
// convention.
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                  \
  struct DynLoad__##__name {                                     \
    template <typename... Args>                                  \
    inline CUptiResult CUPTIAPI operator()(Args... args) {       \
      using cuptiFunc = decltype(&::__name);                     \
      std::call_once(cupti_dso_flag, []() {                      \
        cupti_dso_handle = pten::dynload::GetCUPTIDsoHandle();   \
      });                                                        \
      static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);   \
    }                                                            \
  };                                                             \
  extern DynLoad__##__name __name
// CUPTI activity/callback API entry points resolved at runtime.
#define CUPTI_ROUTINE_EACH(__macro)           \
  __macro(cuptiActivityEnable);               \
  __macro(cuptiActivityDisable);              \
  __macro(cuptiActivityRegisterCallbacks);    \
  __macro(cuptiActivityGetAttribute);         \
  __macro(cuptiActivitySetAttribute);         \
  __macro(cuptiGetTimestamp);                 \
  __macro(cuptiActivityGetNextRecord);        \
  __macro(cuptiGetResultString);              \
  __macro(cuptiActivityGetNumDroppedRecords); \
  __macro(cuptiActivityFlushAll);             \
  __macro(cuptiSubscribe);                    \
  __macro(cuptiUnsubscribe);                  \
  __macro(cuptiEnableCallback);               \
  __macro(cuptiEnableDomain);
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
// The declaration helper is internal to this header.
#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
} // namespace dynload
} // namespace pten
#endif // PADDLE_WITH_CUPTI
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@"
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/curand.h"
namespace pten {
namespace dynload {
// Guards the one-time dlopen of the cuRAND shared library.
std::once_flag curand_dso_flag;
// Explicit nullptr initialization for consistency with the other dynload
// translation units (cupti.cc, hiprtc.cc, lapack.cc); the handle is filled
// in lazily by the first wrapped cuRAND call.
void *curand_dso_handle = nullptr;

// Instantiates the DynLoad__<name> functor objects declared `extern` in
// curand.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <curand.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag curand_dso_flag;
extern void *curand_dso_handle;
// Generates a functor struct DynLoad__<name> that lazily dlopen()s libcurand
// (guarded by curand_dso_flag), resolves <name> once with dlsym, and
// forwards the call returning curandStatus_t.
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                  \
  struct DynLoad__##__name {                                      \
    template <typename... Args>                                   \
    curandStatus_t operator()(Args... args) {                     \
      using curandFunc = decltype(&::__name);                     \
      std::call_once(curand_dso_flag, []() {                      \
        curand_dso_handle = pten::dynload::GetCurandDsoHandle();  \
      });                                                         \
      static void *p_##__name = dlsym(curand_dso_handle, #__name); \
      return reinterpret_cast<curandFunc>(p_##__name)(args...);   \
    }                                                             \
  };                                                              \
  extern DynLoad__##__name __name
// cuRAND generator entry points resolved at runtime.
#define CURAND_RAND_ROUTINE_EACH(__macro)      \
  __macro(curandCreateGenerator);              \
  __macro(curandSetStream);                    \
  __macro(curandSetPseudoRandomGeneratorSeed); \
  __macro(curandGenerateUniform);              \
  __macro(curandGenerateUniformDouble);        \
  __macro(curandGenerateNormal);               \
  __macro(curandDestroyGenerator);
CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cusolver.h"
namespace pten {
namespace dynload {
// Guards the one-time dlopen of the cuSOLVER shared library.
std::once_flag cusolver_dso_flag;
// Explicit nullptr initialization for consistency with the other dynload
// translation units; filled in lazily by the first wrapped cuSOLVER call.
void *cusolver_dso_handle = nullptr;

// Instantiates the DynLoad__<name> functor objects declared `extern` in
// cusolver.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

CUSOLVER_ROUTINE_EACH(DEFINE_WRAP);

// Only defined by cusolver.h when CUDA_VERSION >= 9020.
#ifdef CUSOLVER_ROUTINE_EACH_R1
CUSOLVER_ROUTINE_EACH_R1(DEFINE_WRAP);
#endif

// Only defined by cusolver.h when CUDA_VERSION >= 9020.
#ifdef CUSOLVER_ROUTINE_EACH_R2
CUSOLVER_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
#include <cusolverDn.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cusolver_dso_flag;
extern void *cusolver_dso_handle;
// Generates a functor struct DynLoad__<name> that lazily dlopen()s the
// cuSOLVER library (guarded by cusolver_dso_flag), resolves <name> once with
// dlsym, and forwards the call returning cusolverStatus_t.
#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name)                 \
  struct DynLoad__##__name {                                       \
    template <typename... Args>                                    \
    cusolverStatus_t operator()(Args... args) {                    \
      using cusolverFunc = decltype(&::__name);                    \
      std::call_once(cusolver_dso_flag, []() {                     \
        cusolver_dso_handle = pten::dynload::GetCusolverDsoHandle(); \
      });                                                          \
      static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \
      return reinterpret_cast<cusolverFunc>(p_##__name)(args...);  \
    }                                                              \
  };                                                               \
  extern DynLoad__##__name __name
// Baseline dense-solver entry points (Cholesky and symmetric eigensolver)
// available in all supported CUDA versions.
#define CUSOLVER_ROUTINE_EACH(__macro)    \
  __macro(cusolverDnCreate);              \
  __macro(cusolverDnDestroy);             \
  __macro(cusolverDnSetStream);           \
  __macro(cusolverDnSpotrf_bufferSize);   \
  __macro(cusolverDnDpotrf_bufferSize);   \
  __macro(cusolverDnSpotrf);              \
  __macro(cusolverDnDpotrf);              \
  __macro(cusolverDnSpotrs);              \
  __macro(cusolverDnDpotrs);              \
  __macro(cusolverDnCpotrs);              \
  __macro(cusolverDnZpotrs);              \
  __macro(cusolverDnSsyevd_bufferSize);   \
  __macro(cusolverDnDsyevd_bufferSize);   \
  __macro(cusolverDnCheevd_bufferSize);   \
  __macro(cusolverDnZheevd_bufferSize);   \
  __macro(cusolverDnSsyevd);              \
  __macro(cusolverDnDsyevd);              \
  __macro(cusolverDnCheevd);              \
  __macro(cusolverDnZheevd);
CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
#if CUDA_VERSION >= 9020
// Batched Cholesky, LU/QR factorization, and Jacobi SVD entry points
// (CUDA >= 9.2 only).
#define CUSOLVER_ROUTINE_EACH_R1(__macro)  \
  __macro(cusolverDnSpotrfBatched);        \
  __macro(cusolverDnDpotrfBatched);        \
  __macro(cusolverDnSpotrsBatched);        \
  __macro(cusolverDnDpotrsBatched);        \
  __macro(cusolverDnSgesvdj_bufferSize);   \
  __macro(cusolverDnSgetrf_bufferSize);    \
  __macro(cusolverDnDgetrf_bufferSize);    \
  __macro(cusolverDnCgetrf_bufferSize);    \
  __macro(cusolverDnZgetrf_bufferSize);    \
  __macro(cusolverDnSgeqrf_bufferSize);    \
  __macro(cusolverDnDgeqrf_bufferSize);    \
  __macro(cusolverDnCgeqrf_bufferSize);    \
  __macro(cusolverDnZgeqrf_bufferSize);    \
  __macro(cusolverDnSorgqr_bufferSize);    \
  __macro(cusolverDnDorgqr_bufferSize);    \
  __macro(cusolverDnSormqr_bufferSize);    \
  __macro(cusolverDnDormqr_bufferSize);    \
  __macro(cusolverDnCungqr_bufferSize);    \
  __macro(cusolverDnZungqr_bufferSize);    \
  __macro(cusolverDnDestroyGesvdjInfo);    \
  __macro(cusolverDnCreateGesvdjInfo);     \
  __macro(cusolverDnDgesvdj_bufferSize);   \
  __macro(cusolverDnSgesvdj);              \
  __macro(cusolverDnDgesvdj);              \
  __macro(cusolverDnSgetrf);               \
  __macro(cusolverDnDgetrf);               \
  __macro(cusolverDnCgetrf);               \
  __macro(cusolverDnZgetrf);               \
  __macro(cusolverDnSgeqrf);               \
  __macro(cusolverDnDgeqrf);               \
  __macro(cusolverDnCgeqrf);               \
  __macro(cusolverDnZgeqrf);               \
  __macro(cusolverDnSorgqr);               \
  __macro(cusolverDnDorgqr);               \
  __macro(cusolverDnSormqr);               \
  __macro(cusolverDnDormqr);               \
  __macro(cusolverDnCungqr);               \
  __macro(cusolverDnZungqr);
CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#if CUDA_VERSION >= 9020
// Jacobi symmetric eigensolver entry points (CUDA >= 9.2 only).
#define CUSOLVER_ROUTINE_EACH_R2(__macro) \
  __macro(cusolverDnCreateSyevjInfo);     \
  __macro(cusolverDnSsyevj_bufferSize);   \
  __macro(cusolverDnDsyevj_bufferSize);   \
  __macro(cusolverDnSsyevj);              \
  __macro(cusolverDnDsyevj);              \
  __macro(cusolverDnDestroySyevjInfo);
CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
// The declaration helper is internal to this header.
#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cusparse.h"
namespace pten {
namespace dynload {
// Guards the one-time dlopen of the cuSPARSE shared library.
std::once_flag cusparse_dso_flag;
// Explicit nullptr initialization for consistency with the other dynload
// translation units; filled in lazily by the first wrapped cuSPARSE call.
void *cusparse_dso_handle = nullptr;

// Instantiates the DynLoad__<name> functor objects declared `extern` in
// cusparse.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

// Defined by cusparse.h for CUDA >= 10.1.
#ifdef CUSPARSE_ROUTINE_EACH
CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif

// BUG FIX: this guard previously tested CUBLAS_BLAS_ROUTINE_EACH_R2 (a
// cuBLAS macro that is never defined in this translation unit), so the
// CUDA >= 11.3 cusparseSDDMM* wrappers were declared in cusparse.h but never
// defined here, producing undefined-symbol link errors. Test the cuSPARSE
// macro instead.
#ifdef CUSPARSE_ROUTINE_EACH_R2
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif

// Defined by cusparse.h for CUDA >= 11.2.
#ifdef CUSPARSE_ROUTINE_EACH_11020
CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
#include <cusparse.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cusparse_dso_flag;
extern void *cusparse_dso_handle;
// Generates a functor struct DynLoad__<name> that lazily dlopen()s the
// cuSPARSE library (guarded by cusparse_dso_flag), resolves <name> once with
// dlsym, and forwards the call returning cusparseStatus_t.
#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name)                 \
  struct DynLoad__##__name {                                       \
    template <typename... Args>                                    \
    cusparseStatus_t operator()(Args... args) {                    \
      using cusparseFunc = decltype(&::__name);                    \
      std::call_once(cusparse_dso_flag, []() {                     \
        cusparse_dso_handle = pten::dynload::GetCusparseDsoHandle(); \
      });                                                          \
      static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
      return reinterpret_cast<cusparseFunc>(p_##__name)(args...);  \
    }                                                              \
  };                                                               \
  extern DynLoad__##__name __name
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported from CUDA 10.1.
#if CUDA_VERSION >= 10010
#define CUSPARSE_ROUTINE_EACH(__macro) \
  __macro(cusparseCreate);             \
  __macro(cusparseSetStream);          \
  __macro(cusparseCreateMatDescr);     \
  __macro(cusparseDestroy);            \
  __macro(cusparseSnnz);               \
  __macro(cusparseDnnz);               \
  __macro(cusparseSetMatType);         \
  __macro(cusparseSetMatIndexBase);
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
  __macro(cusparseCreateCsr);                \
  __macro(cusparseCreateCoo);                \
  __macro(cusparseCreateDnMat);              \
  __macro(cusparseSpMM_bufferSize);          \
  __macro(cusparseSpMM);                     \
  __macro(cusparseDestroySpMat);             \
  __macro(cusparseDestroyDnMat);             \
  __macro(cusparseCooSetPointers);           \
  __macro(cusparseCsrSetPointers);           \
  __macro(cusparseDenseToSparse_bufferSize); \
  __macro(cusparseDenseToSparse_analysis);   \
  __macro(cusparseDenseToSparse_convert);    \
  __macro(cusparseSparseToDense_bufferSize); \
  __macro(cusparseSparseToDense);
CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
  __macro(cusparseSDDMM_bufferSize);      \
  __macro(cusparseSDDMM_preprocess);      \
  __macro(cusparseSDDMM);
CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif
#endif
// The declaration helper is internal to this header.
#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
} // namespace dynload
} // namespace pten
此差异已折叠。
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
namespace pten {
namespace dynload {
#ifndef _WIN32
// Deduces the return type of a wrapped call expression; used by the
// DECLARE_DYNAMIC_LOAD_*_WRAP macros. On Windows this decltype form is not
// usable, so MSVC falls back to decltype(auto).
#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
#else
#define DECLARE_TYPE(__name, ...) decltype(auto)
#endif
// Each accessor below returns an OS handle to the named shared library
// (implementations live in dynamic_loader.cc, not shown in this header).
void* GetCublasDsoHandle();
void* GetCublasLtDsoHandle();
void* GetCUDNNDsoHandle();
void* GetCUPTIDsoHandle();
void* GetCurandDsoHandle();
void* GetNvjpegDsoHandle();
void* GetCusolverDsoHandle();
void* GetCusparseDsoHandle();
void* GetNVRTCDsoHandle();
void* GetCUDADsoHandle();
void* GetWarpCTCDsoHandle();
void* GetNCCLDsoHandle();
void* GetHCCLDsoHandle();
void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle();
void* GetLAPACKDsoHandle();
// Loads a custom-operator library by file name.
void* GetOpDsoHandle(const std::string& dso_name);
void* GetNvtxDsoHandle();
void* GetCUFFTDsoHandle();
void* GetMKLRTDsoHandle();
void* GetROCFFTDsoHandle();
// Overrides the directory searched when locating the libraries above.
void SetPaddleLibPath(const std::string&);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/hipfft.h"
namespace pten {
namespace dynload {
// Guards the one-time dlopen of the hipFFT shared library.
std::once_flag hipfft_dso_flag;
// Explicit nullptr initialization for consistency with the other dynload
// translation units; filled in lazily by the first wrapped hipFFT call.
void *hipfft_dso_handle = nullptr;

// Instantiates the DynLoad__<name> functor objects declared `extern` in
// hipfft.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HIP
#include <hipfft.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag hipfft_dso_flag;
extern void *hipfft_dso_handle;
// Generates a functor struct DynLoad__<name> that lazily loads the FFT
// library via GetROCFFTDsoHandle() (guarded by hipfft_dso_flag), resolves
// <name> once with dlsym, and forwards the call preserving its return type.
#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name)                     \
  struct DynLoad__##__name {                                         \
    template <typename... Args>                                      \
    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
      using hipfftFunc = decltype(&::__name);                        \
      std::call_once(hipfft_dso_flag, []() {                         \
        hipfft_dso_handle = pten::dynload::GetROCFFTDsoHandle();     \
      });                                                            \
      static void *p_##__name = dlsym(hipfft_dso_handle, #__name);   \
      return reinterpret_cast<hipfftFunc>(p_##__name)(args...);      \
    }                                                                \
  };                                                                 \
  extern DynLoad__##__name __name
// hipFFT entry points resolved at runtime.
#define HIPFFT_FFT_ROUTINE_EACH(__macro) \
  __macro(hipfftPlan1d);                 \
  __macro(hipfftPlan2d);                 \
  __macro(hipfftPlan3d);                 \
  __macro(hipfftPlanMany);               \
  __macro(hipfftMakePlan1d);             \
  __macro(hipfftMakePlanMany);           \
  __macro(hipfftMakePlanMany64);         \
  __macro(hipfftGetSizeMany64);          \
  __macro(hipfftEstimate1d);             \
  __macro(hipfftEstimate2d);             \
  __macro(hipfftEstimate3d);             \
  __macro(hipfftEstimateMany);           \
  __macro(hipfftCreate);                 \
  __macro(hipfftGetSize1d);              \
  __macro(hipfftGetSizeMany);            \
  __macro(hipfftGetSize);                \
  __macro(hipfftSetWorkArea);            \
  __macro(hipfftSetAutoAllocation);      \
  __macro(hipfftExecC2C);                \
  __macro(hipfftExecR2C);                \
  __macro(hipfftExecC2R);                \
  __macro(hipfftExecZ2Z);                \
  __macro(hipfftExecD2Z);                \
  __macro(hipfftExecZ2D);                \
  __macro(hipfftSetStream);              \
  __macro(hipfftDestroy);                \
  __macro(hipfftGetVersion);             \
  __macro(hipfftGetProperty);
HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
// Maps a hipfftResult_t status code to a human-readable description for use
// in error messages; unrecognized codes fall through to a generic string.
inline const char *hipfftGetErrorString(hipfftResult_t status) {
  switch (status) {
    case HIPFFT_SUCCESS:
      return "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
    case HIPFFT_INVALID_PLAN:
      return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
    case HIPFFT_ALLOC_FAILED:
      return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
             "memory.";
    case HIPFFT_INVALID_TYPE:
      return "'HIPFFT_INVALID_TYPE'. No longer used.";
    case HIPFFT_INVALID_VALUE:
      return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
             "parameter.";
    case HIPFFT_INTERNAL_ERROR:
      return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
             "error.";
    case HIPFFT_EXEC_FAILED:
      return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
    case HIPFFT_SETUP_FAILED:
      return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
    case HIPFFT_INVALID_SIZE:
      return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
    case HIPFFT_UNALIGNED_DATA:
      return "'HIPFFT_UNALIGNED_DATA'. No longer used.";
    case HIPFFT_INCOMPLETE_PARAMETER_LIST:
      return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
    case HIPFFT_INVALID_DEVICE:
      return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
             "GPU than plan creation.";
    case HIPFFT_PARSE_ERROR:
      return "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
    case HIPFFT_NO_WORKSPACE:
      return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
             "plan execution.";
    case HIPFFT_NOT_IMPLEMENTED:
      return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
             "functionality for parameters given.";
    case HIPFFT_NOT_SUPPORTED:
      return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
             "parameters given.";
    default:
      // Future hipFFT versions may add codes this switch does not know.
      return "HIPFFT_STATUS_UNKNOWN_ERROR";
  }
}
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/hiprand.h"
namespace pten {
namespace dynload {
// Guards the one-time dlopen of the hipRAND shared library.
std::once_flag hiprand_dso_flag;
// Explicit nullptr initialization for consistency with the other dynload
// translation units; filled in lazily by the first wrapped hipRAND call.
void *hiprand_dso_handle = nullptr;

// Instantiates the DynLoad__<name> functor objects declared `extern` in
// hiprand.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name

HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hiprand.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag hiprand_dso_flag;
extern void *hiprand_dso_handle;
// NOTE(review): hipRAND reuses the cuRAND wrapper-macro name and
// GetCurandDsoHandle(); on HIP builds the loader is presumably resolving the
// hipRAND library — confirm against dynamic_loader.cc.
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                   \
  struct DynLoad__##__name {                                       \
    template <typename... Args>                                    \
    hiprandStatus_t operator()(Args... args) {                     \
      using hiprandFunc = decltype(&::__name);                     \
      std::call_once(hiprand_dso_flag, []() {                      \
        hiprand_dso_handle = pten::dynload::GetCurandDsoHandle();  \
      });                                                          \
      static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \
      return reinterpret_cast<hiprandFunc>(p_##__name)(args...);   \
    }                                                              \
  };                                                               \
  extern DynLoad__##__name __name
// hipRAND generator entry points resolved at runtime.
#define HIPRAND_RAND_ROUTINE_EACH(__macro)      \
  __macro(hiprandCreateGenerator);              \
  __macro(hiprandSetStream);                    \
  __macro(hiprandSetPseudoRandomGeneratorSeed); \
  __macro(hiprandGenerateUniform);              \
  __macro(hiprandGenerateUniformDouble);        \
  __macro(hiprandGenerateNormal);               \
  __macro(hiprandDestroyGenerator);
HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/hiprtc.h"
namespace pten {
namespace dynload {
// Guards the one-time dlopen of the hiprtc shared library.
std::once_flag hiprtc_dso_flag;
// Handle filled in lazily by HasNVRTC() or the first wrapped hiprtc call.
void* hiprtc_dso_handle = nullptr;
// Instantiates the DynLoad__<name> functor objects declared `extern` in
// hiprtc.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRTC_ROUTINE_EACH(DEFINE_WRAP);
// Reports whether the runtime-compilation library could be loaded. The
// dlopen happens at most once (guarded by hiprtc_dso_flag); later calls
// simply test the cached handle.
bool HasNVRTC() {
  std::call_once(hiprtc_dso_flag,
                 [] { hiprtc_dso_handle = GetNVRTCDsoHandle(); });
  return hiprtc_dso_handle != nullptr;
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hip/hiprtc.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag hiprtc_dso_flag;
extern void* hiprtc_dso_handle;
extern bool HasNVRTC();
// Generates a functor struct DynLoad__<name> that lazily loads the hiprtc
// library via GetNVRTCDsoHandle() (guarded by hiprtc_dso_flag), resolves
// <name> once with dlsym, and forwards the call preserving its return type.
#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name)                     \
  struct DynLoad__##__name {                                         \
    template <typename... Args>                                      \
    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
      using hiprtc_func = decltype(&::__name);                       \
      std::call_once(hiprtc_dso_flag, []() {                         \
        hiprtc_dso_handle = pten::dynload::GetNVRTCDsoHandle();      \
      });                                                            \
      static void* p_##__name = dlsym(hiprtc_dso_handle, #__name);   \
      return reinterpret_cast<hiprtc_func>(p_##__name)(args...);     \
    }                                                                \
  };                                                                 \
  extern struct DynLoad__##__name __name
/**
 * include all needed hiprtc functions
 **/
#define HIPRTC_ROUTINE_EACH(__macro) \
  __macro(hiprtcVersion);            \
  __macro(hiprtcGetErrorString);     \
  __macro(hiprtcCompileProgram);     \
  __macro(hiprtcCreateProgram);      \
  __macro(hiprtcDestroyProgram);     \
  __macro(hiprtcGetCode);            \
  __macro(hiprtcGetCodeSize);        \
  __macro(hiprtcGetProgramLog);      \
  __macro(hiprtcGetProgramLogSize)
HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
// The declaration helper is internal to this header.
#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/lapack.h"
#include <mutex>
namespace pten {
namespace dynload {
// Guards the one-time dlopen of the LAPACK shared library.
std::once_flag lapack_dso_flag;
// Handle filled in lazily by the first wrapped LAPACK call (see lapack.h).
void* lapack_dso_handle = nullptr;
// Instantiates the DynLoad__<name> functor objects declared `extern` in
// lapack.h.
#define DEFINE_WRAP(__name) DynLoad__##__name __name
LAPACK_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册