Unverified Commit 3c1dc6f6 authored by Wilber, committed by GitHub

[PTEN] Move dynload from fluid to pten. (#39120)

* move dynload from fluid to pten.

* fix ci compile

* fix windows ci compile.

* update

* update

* fix compile error
Parent 11938ae1
......@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace butil {
class IOBuf;
......@@ -78,11 +78,11 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
const framework::Scope* scope);
void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& iobuf,
butil::IOBufBytesIterator& iobuf, // NOLINT
const platform::DeviceContext& ctx);
void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& iobuf,
butil::IOBufBytesIterator& iobuf, // NOLINT
const platform::DeviceContext& ctx);
std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port);
......
......@@ -40,9 +40,9 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace distributed {
......@@ -202,7 +202,7 @@ class ValueBlock {
// value = _alloc.acquire(value_length_);
table[id] = value;
} else {
value = (VALUE *)(void *)(res->second);
value = (VALUE *)(void *)(res->second); // NOLINT
}
return value;
}
......@@ -282,8 +282,8 @@ class ValueBlock {
value->unseen_days_++;
if (value->unseen_days_ >= threshold) {
butil::return_object(iter->second);
//_alloc.release(iter->second);
//_alloc.release(value);
// _alloc.release(iter->second);
// _alloc.release(value);
iter = table.erase(iter);
} else {
++iter;
......
......@@ -38,8 +38,8 @@ limitations under the License. */
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace framework {
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc: glog's severity names conflict with windows.h
#include "paddle/fluid/framework/io/shell.h"
#include "paddle/fluid/platform/enforce.h"
......
......@@ -34,8 +34,8 @@
#include <utility>
#include <vector>
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/backends/dynload/port.h"
#if defined(__arm__) || defined(__aarch64__) || defined(__ARM_NEON) || \
defined(__ARM_NEON__)
......
......@@ -34,7 +34,7 @@ limitations under the License. */
#include "paddle/fluid/framework/trainer_desc.pb.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace framework {
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace inference {
......
......@@ -28,7 +28,7 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
#ifdef _WIN32
#include <direct.h>
......
......@@ -20,7 +20,7 @@
#include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace framework {
......
......@@ -31,8 +31,8 @@
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/pten/backends/dynload/port.h"
extern std::string paddle::framework::DataTypeToString(
const framework::proto::VarType::Type type);
......
......@@ -22,8 +22,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/pten/backends/dynload/port.h"
DECLARE_bool(use_mkldnn);
......
......@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
DECLARE_bool(use_mkldnn);
......
......@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace operators {
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
#define HCOM_GROUP_PREFIX "HCOM_GROUP_"
......
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce pten_dynamic_loader)
list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
......@@ -34,24 +34,24 @@ if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
if(WITH_ROCM)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader pten_dynload_cuda)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
elseif (WITH_ASCEND_CL)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl pten_dynload_warpctc)
else()
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader pten_dynload_cuda)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc)
endif()
if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml pten_dynload_mklml)
endif()
cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader)
cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader pten_dynload_lapack)
add_dependencies(dynload_lapack extern_lapack)
# TODO(TJ): add iomp, mkldnn?
if (MKL_FOUND AND WITH_ONEMKL)
message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader)
cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader pten_dynload_mklrt)
target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE})
endif()
......@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cublas_dso_flag;
void *cublas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......
......@@ -20,16 +20,12 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cublas.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cublas_dso_flag;
extern void *cublas_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublas routine
......@@ -37,19 +33,8 @@ extern void *cublas_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublas_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublas_dso_flag, []() { \
cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublas_dso_handle, #__name); \
return reinterpret_cast<cublas_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
......@@ -99,7 +84,7 @@ extern void *cublas_dso_handle;
__macro(cublasSgetrsBatched); \
__macro(cublasDgetrsBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000
......@@ -111,7 +96,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.0
......@@ -120,7 +105,7 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasSetMathMode); \
__macro(cublasGetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.1
......@@ -129,10 +114,10 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
__macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx);
CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cublasLt_dso_flag;
void *cublasLt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......
......@@ -19,16 +19,12 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cublasLt.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cublasLt_dso_flag;
extern void *cublasLt_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublasLt routine
......@@ -36,20 +32,8 @@ extern void *cublasLt_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = \
paddle::platform::dynload::GetCublasLtDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
// APIs available after CUDA 10.1
......@@ -69,10 +53,10 @@ extern void *cublasLt_dso_handle;
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -13,14 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#include "paddle/pten/backends/dynload/cuda_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cuda_dso_flag;
void* cuda_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#if CUDA_VERSION >= 10020
......@@ -28,10 +26,7 @@ CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP);
#endif
CUDA_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() {
std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); });
return cuda_dso_handle != nullptr;
}
bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
} // namespace dynload
} // namespace platform
......
......@@ -17,30 +17,17 @@ limitations under the License. */
#include <cuda.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cuda_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cuda_dso_flag;
extern void* cuda_dso_handle;
extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cuda_func = decltype(&::__name); \
std::call_once(cuda_dso_flag, []() { \
cuda_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(cuda_dso_handle, #__name); \
return reinterpret_cast<cuda_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cuda driver functions
......@@ -72,12 +59,12 @@ extern bool HasCUDADriver();
__macro(cuMemRelease); \
__macro(cuMemAddressFree)
CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
CUDA_ROUTINE_EACH_VVM(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#endif
CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
CUDA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUDA_WRAP
} // namespace dynload
} // namespace platform
......
......@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cudnn_dso_flag;
void* cudnn_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......@@ -45,19 +43,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
return cudnn_dso_handle != nullptr;
}
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cudnn_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cudnn shared library. Cannot invoke method %s.",
fn_name));
}
bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
} // namespace dynload
} // namespace platform
......
......@@ -18,32 +18,17 @@ limitations under the License. */
#include <glog/logging.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cudnn_dso_flag;
extern void* cudnn_dso_handle;
extern bool HasCUDNN();
extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cudnn_func = decltype(&::__name); \
std::call_once(cudnn_dso_flag, []() { \
cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cudnn functions in HPPL
......@@ -127,7 +112,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnGetActivationDescriptor); \
__macro(cudnnDestroyActivationDescriptor); \
__macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \
......@@ -135,7 +120,8 @@ CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(
PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7001
......@@ -153,7 +139,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
__macro(cudnnGetConvolutionForwardAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7201
......@@ -166,7 +152,7 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx);
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7401
......@@ -176,7 +162,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 8000
......@@ -192,7 +178,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnSetFusedOpsConstParamPackAttribute); \
__macro(cudnnSetFusedOpsVariantParamPackAttribute); \
__macro(cudnnMakeFusedOpsPlan);
CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R8(PLATFORM_DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
} // namespace dynload
......
......@@ -13,31 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cufft.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/cufft.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cufft_dso_flag;
void* cufft_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUFFT() {
std::call_once(cufft_dso_flag,
[]() { cufft_dso_handle = GetCUFFTDsoHandle(); });
return cufft_dso_handle != nullptr;
}
void EnforceCUFFTLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cufft_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cufft shared library. Cannot invoke method %s.",
fn_name));
}
bool HasCUFFT() { return pten::dynload::HasCUFFT(); }
} // namespace dynload
} // namespace platform
......
......@@ -19,32 +19,17 @@ limitations under the License. */
#include <glog/logging.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cufft.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cufft_dso_flag;
extern void* cufft_dso_handle;
extern bool HasCUFFT();
extern void EnforceCUFFTLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cufft_func = decltype(&::__name); \
std::call_once(cufft_dso_flag, []() { \
cufft_dso_handle = paddle::platform::dynload::GetCUFFTDsoHandle(); \
}); \
EnforceCUFFTLoaded(#__name); \
static void* p_##__name = dlsym(cufft_dso_handle, #__name); \
return reinterpret_cast<cufft_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cufft functions in HPPL
......@@ -104,7 +89,7 @@ extern void EnforceCUFFTLoaded(const char* fn_name);
__macro(cufftXtExecDescriptor); \
__macro(cufftXtSetWorkAreaPolicy);
CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
CUFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
} // namespace dynload
} // namespace platform
......
......@@ -20,9 +20,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cupti_dso_flag;
void *cupti_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUPTI_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -19,16 +19,12 @@ limitations under the License. */
#include <cupti.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cupti.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cupti_dso_flag;
extern void *cupti_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cupti routine
......@@ -36,18 +32,8 @@ extern void *cupti_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
using cuptiFunc = decltype(&::__name); \
std::call_once(cupti_dso_flag, []() { \
cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
}); \
static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \
}; \
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CUPTI_ROUTINE_EACH(__macro) \
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag curand_dso_flag;
void *curand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,27 +16,14 @@ limitations under the License. */
#include <curand.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/curand.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag curand_dso_flag;
extern void *curand_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
curandStatus_t operator()(Args... args) { \
using curandFunc = decltype(&::__name); \
std::call_once(curand_dso_flag, []() { \
curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(curand_dso_handle, #__name); \
return reinterpret_cast<curandFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CURAND_RAND_ROUTINE_EACH(__macro) \
......@@ -48,7 +35,7 @@ extern void *curand_dso_handle;
__macro(curandGenerateNormal); \
__macro(curandDestroyGenerator);
CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
CURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace platform
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cusolver_dso_flag;
void *cusolver_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUSOLVER_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,28 +17,14 @@ limitations under the License. */
#include <cusolverDn.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cusolver.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cusolver_dso_flag;
extern void *cusolver_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cusolverStatus_t operator()(Args... args) { \
using cusolverFunc = decltype(&::__name); \
std::call_once(cusolver_dso_flag, []() { \
cusolver_dso_handle = \
paddle::platform::dynload::GetCusolverDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \
return reinterpret_cast<cusolverFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define CUSOLVER_ROUTINE_EACH(__macro) \
......@@ -62,7 +48,7 @@ extern void *cusolver_dso_handle;
__macro(cusolverDnCheevd); \
__macro(cusolverDnZheevd);
CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
CUSOLVER_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
#if CUDA_VERSION >= 9020
#define CUSOLVER_ROUTINE_EACH_R1(__macro) \
......@@ -105,7 +91,7 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
__macro(cusolverDnCungqr); \
__macro(cusolverDnZungqr);
CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
CUSOLVER_ROUTINE_EACH_R1(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#if CUDA_VERSION >= 9020
......@@ -117,10 +103,10 @@ CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
__macro(cusolverDnDsyevj); \
__macro(cusolverDnDestroySyevjInfo);
CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
CUSOLVER_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cusparse_dso_flag;
void *cusparse_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#ifdef CUSPARSE_ROUTINE_EACH
......
......@@ -17,28 +17,14 @@ limitations under the License. */
#include <cusparse.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/cusparse.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cusparse_dso_flag;
extern void *cusparse_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cusparseStatus_t operator()(Args... args) { \
using cusparseFunc = decltype(&::__name); \
std::call_once(cusparse_dso_flag, []() { \
cusparse_dso_handle = \
paddle::platform::dynload::GetCusparseDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
return reinterpret_cast<cusparseFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#if defined(PADDLE_WITH_CUDA)
......@@ -54,7 +40,7 @@ extern void *cusparse_dso_handle;
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
......@@ -74,7 +60,7 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
......@@ -83,13 +69,13 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
__macro(cusparseSDDMM_preprocess); \
__macro(cusparseSDDMM);
CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -15,556 +15,61 @@ limitations under the License. */
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/enforce.h"
#if defined(_WIN32)
#include <windows.h>
#endif
DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, "
"/usr/local/cudnn/lib. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH");
DEFINE_string(
cuda_dir, "",
"Specify path for loading cuda library, such as libcublas, libcublasLt "
"libcurand, libcusolver. For instance, /usr/local/cuda/lib64. "
"If default, dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(nccl_dir, "",
"Specify path for loading nccl library, such as libnccl.so. "
"For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(hccl_dir, "",
"Specify path for loading hccl library, such as libhccl.so. "
"For instance, "
"/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. If "
"default, "
"dlopen will search hccl from LD_LIBRARY_PATH");
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
DEFINE_string(
tensorrt_dir, "",
"Specify path for loading tensorrt library, such as libnvinfer.so.");
DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
DEFINE_string(mkl_dir, "",
"Specify path for loading libmkl_rt.so. "
"For insrance, /opt/intel/oneapi/mkl/latest/lib/intel64/."
"If default, "
"dlopen will search mkl from LD_LIBRARY_PATH");
DEFINE_string(op_dir, "", "Specify path for loading user-defined op library.");
#ifdef PADDLE_WITH_HIP
DEFINE_string(miopen_dir, "",
"Specify path for loading libMIOpen.so. For instance, "
"/opt/rocm/miopen/lib. If empty [default], dlopen "
"will search miopen from LD_LIBRARY_PATH");
DEFINE_string(rocm_dir, "",
"Specify path for loading rocm library, such as librocblas, "
"libmiopen, libhipsparse. For instance, /opt/rocm/lib. "
"If default, dlopen will search rocm from LD_LIBRARY_PATH");
DEFINE_string(rccl_dir, "",
"Specify path for loading rccl library, such as librccl.so. "
"For instance, /opt/rocm/rccl/lib. If default, "
"dlopen will search rccl from LD_LIBRARY_PATH");
#endif
#include "paddle/pten/backends/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
struct PathNode {
PathNode() {}
std::string path = "";
};
static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
// NOTE: In order to adapt to the default installation path of cuda
#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin";
#else
static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64";
#endif
static PathNode s_py_site_pkg_path;
#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll";
static constexpr char* win_cublas_lib =
"cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cublas64_" CUDA_VERSION_MAJOR ".dll";
#if CUDA_VERSION >= 11000
static constexpr char* win_curand_lib =
"curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll";
static constexpr char* win_nvjpeg_lib =
"nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll";
static constexpr char* win_cusolver_lib =
"cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll";
static constexpr char* win_cusparse_lib =
"cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll";
static constexpr char* win_cufft_lib =
"cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_10.dll";
#else
static constexpr char* win_curand_lib =
"curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;curand64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_nvjpeg_lib =
"nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cusolver_lib =
"cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cusparse_lib =
"cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cufft_lib =
"cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cufft64_" CUDA_VERSION_MAJOR ".dll";
#endif // CUDA_VERSION
#endif
static inline std::string join(const std::string& part1,
const std::string& part2) {
// directory separator
const char sep = '/';
if (!part2.empty() && part2.front() == sep) {
return part2;
}
std::string ret;
ret.reserve(part1.size() + part2.size() + 1);
ret = part1;
if (!ret.empty() && ret.back() != sep) {
ret += sep;
}
ret += part2;
return ret;
}
static inline std::vector<std::string> split(
const std::string& str, const std::string separator = " ") {
std::vector<std::string> str_list;
std::string::size_type firstPos;
firstPos = str.find_first_not_of(separator, 0);
std::string::size_type lastPos;
lastPos = str.find_first_of(separator, firstPos);
while (std::string::npos != firstPos && std::string::npos != lastPos) {
str_list.push_back(str.substr(firstPos, lastPos - firstPos));
firstPos = str.find_first_not_of(separator, lastPos);
lastPos = str.find_first_of(separator, firstPos);
}
// Also require a valid firstPos: for an empty or all-separator input both
// positions are npos and substr(npos) would throw.
if (std::string::npos != firstPos && std::string::npos == lastPos) {
str_list.push_back(str.substr(firstPos, lastPos - firstPos));
}
return str_list;
}
void SetPaddleLibPath(const std::string& py_site_pkg_path) {
s_py_site_pkg_path.path = py_site_pkg_path;
VLOG(3) << "Set paddle lib path : " << py_site_pkg_path;
pten::dynload::SetPaddleLibPath(py_site_pkg_path);
}
static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
const std::string& dso_name,
int dynload_flags) {
void* dso_handle = nullptr;
if (!spec_path.empty()) {
// search xxx.so from custom path
VLOG(3) << "Try to find library: " << dso_name
<< " from specific path: " << spec_path;
std::string dso_path = join(spec_path, dso_name);
dso_handle = dlopen(dso_path.c_str(), dynload_flags);
}
return dso_handle;
}
static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
int dynload_flags) {
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
// and /usr/local/lib path
void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
VLOG(3) << "Try to find library: " << dso_path
<< " from default system path.";
// TODO(chenweihang): Which libs is this path used to search for?
// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 due to
// System Integrity Protection (SIP); if dso_handle is null,
// search the default package path on Mac OS instead.
#if defined(__APPLE__) || defined(__OSX__)
if (nullptr == dso_handle) {
dso_handle =
dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
}
#endif
return dso_handle;
}
/*
* We define three priorities for dynamic library search:
*
* First: Search for the path specified by the user
* Second: Search the system default path
* Third: Search for a special path corresponding to
* a specific library to adapt to changes and easy to expand.
*/
static inline void* GetDsoHandleFromSearchPath(
const std::string& config_path, const std::string& dso_name,
bool throw_on_error = true,
const std::vector<std::string>& extra_paths = std::vector<std::string>(),
const std::string& warning_msg = std::string()) {
#if !defined(_WIN32)
int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
#else
int dynload_flags = 0;
#endif // !_WIN32
std::vector<std::string> dso_names = split(dso_name, ";");
void* dso_handle = nullptr;
for (auto dso : dso_names) {
// 1. search in user config path by FLAGS
dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags);
// 2. search in system default path
if (nullptr == dso_handle) {
dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags);
}
// 3. search in extra paths
if (nullptr == dso_handle) {
for (auto path : extra_paths) {
VLOG(3) << "extra_paths: " << path;
dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags);
}
}
if (nullptr != dso_handle) break;
}
// 4. [If failed for all dso_names] log the warning message if one exists
if (nullptr == dso_handle && !warning_msg.empty()) {
LOG(WARNING) << warning_msg;
}
// 5. [If failed for all dso_names] log or throw the error info
if (nullptr == dso_handle) {
auto error_msg =
"The third-party dynamic library (%s) that Paddle depends on is not "
"configured correctly. (error code is %s)\n"
" Suggestions:\n"
" 1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) "
"is installed correctly and its version is matched with paddlepaddle "
"you installed.\n"
" 2. Configure third-party dynamic library environment variables as "
"follows:\n"
" - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
" - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
" - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` "
"[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
"impossible unless System Integrity Protection (SIP) is disabled.]";
#if !defined(_WIN32)
auto errorno = dlerror();
#else
auto errorno = GetLastError();
#endif // !_WIN32
if (throw_on_error) {
// NOTE: Special error report case, no need to change its format
PADDLE_THROW(
platform::errors::PreconditionNotMet(error_msg, dso_name, errorno));
} else {
LOG(WARNING) << string::Sprintf(error_msg, dso_name, errorno);
}
}
return dso_handle;
}
void* GetCublasDsoHandle() { return pten::dynload::GetCublasDsoHandle(); }
void* GetCublasLtDsoHandle() { return pten::dynload::GetCublasLtDsoHandle(); }
void* GetCUDNNDsoHandle() { return pten::dynload::GetCUDNNDsoHandle(); }
void* GetCUPTIDsoHandle() { return pten::dynload::GetCUPTIDsoHandle(); }
void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib, true,
{cuda_lib_path});
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
#endif
}
void* GetCublasLtDsoHandle() {
// APIs available after CUDA 10.1
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
#else
std::string warning_msg(
"Your CUDA_VERSION is less than 10.1, which does not support CublasLt. "
"If you want to use CublasLt, please upgrade CUDA and rebuild "
"PaddlePaddle.");
LOG(WARNING) << warning_msg;
return nullptr;
#endif
}
void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
std::string mac_warn_meg(
"Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
"For instance, sudo tar -xzf "
"cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
"chmod a+r /usr/local/cuda/include/cudnn.h "
"/usr/local/cuda/lib/libcudnn*");
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false,
{}, mac_warn_meg);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
std::string win_warn_meg(
"Note: [Recommend] copy cudnn into CUDA installation directory. \n "
"For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from "
"NVIDIA's official website, \n"
"then, unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing "
"Toolkit\\CUDA\\v10.0\n"
"You should do this according to your CUDA installation directory and "
"CUDNN version.");
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib, true,
{cuda_lib_path}, win_warn_meg);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false,
{cuda_lib_path});
#endif
}
void* GetCUPTIDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.dylib", false,
{cupti_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.so", false,
{cupti_lib_path});
#endif
}
void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib, true,
{cuda_lib_path});
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
#endif
}
void* GetCurandDsoHandle() { return pten::dynload::GetCurandDsoHandle(); }
#ifdef PADDLE_WITH_HIP
void* GetROCFFTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so");
#endif
}
void* GetROCFFTDsoHandle() { return pten::dynload::GetROCFFTDsoHandle(); }
#endif
void* GetNvjpegDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_nvjpeg_lib, true,
{cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so");
#endif
}
void* GetNvjpegDsoHandle() { return pten::dynload::GetNvjpegDsoHandle(); }
void* GetCusolverDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cusolver_lib, true,
{cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
#endif
}
void* GetCusolverDsoHandle() { return pten::dynload::GetCusolverDsoHandle(); }
void* GetCusparseDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cusparse_lib, true,
{cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so");
#endif
}
void* GetCusparseDsoHandle() { return pten::dynload::GetCusparseDsoHandle(); }
void* GetNVRTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
#endif
}
void* GetNVRTCDsoHandle() { return pten::dynload::GetNVRTCDsoHandle(); }
void* GetCUDADsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
#elif defined(_WIN32)
char system32_dir[MAX_PATH];
GetSystemDirectory(system32_dir, MAX_PATH);
return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
#endif
}
void* GetCUDADsoHandle() { return pten::dynload::GetCUDADsoHandle(); }
void* GetWarpCTCDsoHandle() {
std::string warpctc_dir = "";
if (!s_py_site_pkg_path.path.empty()) {
warpctc_dir = s_py_site_pkg_path.path;
}
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll");
#else
return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so");
#endif
}
void* GetWarpCTCDsoHandle() { return pten::dynload::GetWarpCTCDsoHandle(); }
void* GetNCCLDsoHandle() {
#ifdef PADDLE_WITH_HIP
std::string warning_msg(
"You may need to install 'rccl' from ROCM official website: "
"https://rocmdocs.amd.com/en/latest/Installation_Guide/"
"Installation-Guide.html before install PaddlePaddle.");
#else
std::string warning_msg(
"You may need to install 'nccl2' from NVIDIA official website: "
"https://developer.nvidia.com/nccl/nccl-download"
"before install PaddlePaddle.");
#endif
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true, {},
warning_msg);
#else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
warning_msg);
#endif
}
void* GetNCCLDsoHandle() { return pten::dynload::GetNCCLDsoHandle(); }
void* GetHCCLDsoHandle() { return pten::dynload::GetHCCLDsoHandle(); }
void* GetHCCLDsoHandle() {
std::string warning_msg(
"You may need to install 'hccl2' from Huawei official website: "
"before install PaddlePaddle.");
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
#elif defined(PADDLE_WITH_ASCEND_CL)
return GetDsoHandleFromSearchPath(FLAGS_hccl_dir, "libhccl.so", true, {},
warning_msg);
#else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
warning_msg);
#endif
}
void* GetTensorRtDsoHandle() { return pten::dynload::GetTensorRtDsoHandle(); }
void* GetMKLMLDsoHandle() { return pten::dynload::GetMKLMLDsoHandle(); }
void* GetTensorRtDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
#endif
}
void* GetMKLMLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
#endif
}
void* GetLAPACKDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3");
#endif
}
void* GetLAPACKDsoHandle() { return pten::dynload::GetLAPACKDsoHandle(); }
void* GetOpDsoHandle(const std::string& dso_name) {
return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
return pten::dynload::GetOpDsoHandle(dso_name);
}
void* GetNvtxDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Apple."));
#elif defined(_WIN32)
PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Windows."));
#elif !defined(PADDLE_WITH_CUDA)
PADDLE_THROW(
platform::errors::Unimplemented("Nvtx is not supported without CUDA."));
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
#endif
}
void* GetNvtxDsoHandle() { return pten::dynload::GetNvtxDsoHandle(); }
void* GetCUFFTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cufft_lib, true,
{cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so");
#endif
}
void* GetCUFFTDsoHandle() { return pten::dynload::GetCUFFTDsoHandle(); }
void* GetMKLRTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so");
#endif
}
void* GetMKLRTDsoHandle() { return pten::dynload::GetMKLRTDsoHandle(); }
} // namespace dynload
} // namespace platform
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag hipfft_dso_flag;
void *hipfft_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,8 +17,7 @@ limitations under the License. */
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/hipfft.h"
namespace paddle {
namespace platform {
......@@ -26,18 +25,8 @@ namespace dynload {
extern std::once_flag hipfft_dso_flag;
extern void *hipfft_dso_handle;
#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hipfftFunc = decltype(&::__name); \
std::call_once(hipfft_dso_flag, []() { \
hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \
}); \
static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \
return reinterpret_cast<hipfftFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define HIPFFT_FFT_ROUTINE_EACH(__macro) \
......@@ -70,53 +59,8 @@ extern void *hipfft_dso_handle;
__macro(hipfftGetVersion); \
__macro(hipfftGetProperty);
HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
HIPFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
inline const char *hipfftGetErrorString(hipfftResult_t status) {
switch (status) {
case HIPFFT_SUCCESS:
return "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
case HIPFFT_INVALID_PLAN:
return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
case HIPFFT_ALLOC_FAILED:
return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
"memory.";
case HIPFFT_INVALID_TYPE:
return "'HIPFFT_INVALID_TYPE'. No longer used.";
case HIPFFT_INVALID_VALUE:
return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
"parameter.";
case HIPFFT_INTERNAL_ERROR:
return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
"error.";
case HIPFFT_EXEC_FAILED:
return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
case HIPFFT_SETUP_FAILED:
return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
case HIPFFT_INVALID_SIZE:
return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
case HIPFFT_UNALIGNED_DATA:
return "'HIPFFT_UNALIGNED_DATA'. No longer used.";
case HIPFFT_INCOMPLETE_PARAMETER_LIST:
return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
case HIPFFT_INVALID_DEVICE:
return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
"GPU than plan creation.";
case HIPFFT_PARSE_ERROR:
return "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
case HIPFFT_NO_WORKSPACE:
return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
"plan execution.";
case HIPFFT_NOT_IMPLEMENTED:
return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
"functionality for parameters given.";
case HIPFFT_NOT_SUPPORTED:
return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
"parameters given.";
default:
return "HIPFFT_STATUS_UNKNOWN_ERROR";
}
}
} // namespace dynload
} // namespace platform
} // namespace paddle
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag hiprand_dso_flag;
void *hiprand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,28 +16,15 @@ limitations under the License. */
#include <hiprand.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/hiprand.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag hiprand_dso_flag;
extern void *hiprand_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
hiprandStatus_t operator()(Args... args) { \
using hiprandFunc = decltype(&::__name); \
std::call_once(hiprand_dso_flag, []() { \
hiprand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \
return reinterpret_cast<hiprandFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define HIPRAND_RAND_ROUTINE_EACH(__macro) \
......@@ -49,7 +36,7 @@ extern void *hiprand_dso_handle;
__macro(hiprandGenerateNormal); \
__macro(hiprandDestroyGenerator);
HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
HIPRAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace platform
......
......@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/hiprtc.h"
#include "paddle/pten/backends/dynload/hiprtc.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag hiprtc_dso_flag;
void* hiprtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() {
std::call_once(hiprtc_dso_flag,
[]() { hiprtc_dso_handle = GetNVRTCDsoHandle(); });
return hiprtc_dso_handle != nullptr;
}
bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
} // namespace dynload
} // namespace platform
......
......@@ -16,30 +16,17 @@ limitations under the License. */
#include <hip/hiprtc.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/hiprtc.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag hiprtc_dso_flag;
extern void* hiprtc_dso_handle;
extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hiprtc_func = decltype(&::__name); \
std::call_once(hiprtc_dso_flag, []() { \
hiprtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \
return reinterpret_cast<hiprtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed hiprtc functions
......@@ -55,9 +42,9 @@ extern bool HasNVRTC();
__macro(hiprtcGetProgramLog); \
__macro(hiprtcGetProgramLogSize)
HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
HIPRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
} // namespace dynload
} // namespace platform
......
......@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/lapack.h"
#include <mutex>
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag lapack_dso_flag;
void* lapack_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
LAPACK_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,122 +16,20 @@ limitations under the License. */
#include <complex>
#include <mutex>
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
// Note(zhouwei): LAPACK does not ship an appropriate header file, so the API
// prototypes must be declared here ourselves.
// getrf_ (for example)
extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv,
int *info);
extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv,
int *info);
// evd
extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex<double> *a,
int *lda, double *w, std::complex<double> *work,
int *lwork, double *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex<float> *a,
int *lda, float *w, std::complex<float> *work,
int *lwork, float *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda,
double *w, double *work, int *lwork, int *iwork,
int *liwork, int *info);
extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda,
float *w, float *work, int *lwork, int *iwork,
int *liwork, int *info);
// geev
extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda,
double *wr, double *wi, double *vl, int *ldvl,
double *vr, int *ldvr, double *work, int *lwork,
int *info);
extern "C" void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda,
float *wr, float *wi, float *vl, int *ldvl, float *vr,
int *ldvr, float *work, int *lwork, int *info);
extern "C" void zgeev_(char *jobvl, char *jobvr, int *n,
std::complex<double> *a, int *lda,
std::complex<double> *w, std::complex<double> *vl,
int *ldvl, std::complex<double> *vr, int *ldvr,
std::complex<double> *work, int *lwork, double *rwork,
int *info);
extern "C" void cgeev_(char *jobvl, char *jobvr, int *n, std::complex<float> *a,
int *lda, std::complex<float> *w,
std::complex<float> *vl, int *ldvl,
std::complex<float> *vr, int *ldvr,
std::complex<float> *work, int *lwork, float *rwork,
int *info);
// gels
extern "C" void dgels_(char *trans, int *m, int *n, int *nrhs, double *a,
int *lda, double *b, int *ldb, double *work, int *lwork,
int *info);
extern "C" void sgels_(char *trans, int *m, int *n, int *nrhs, float *a,
int *lda, float *b, int *ldb, float *work, int *lwork,
int *info);
// gelsd
extern "C" void dgelsd_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, double *s, double *rcond,
int *rank, double *work, int *lwork, int *iwork,
int *info);
extern "C" void sgelsd_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, float *s, float *rcond, int *rank,
float *work, int *lwork, int *iwork, int *info);
// gelsy
extern "C" void dgelsy_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, int *jpvt, double *rcond,
int *rank, double *work, int *lwork, int *info);
extern "C" void sgelsy_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, int *jpvt, float *rcond, int *rank,
float *work, int *lwork, int *info);
// gelss
extern "C" void dgelss_(int *m, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, double *s, double *rcond,
int *rank, double *work, int *lwork, int *info);
extern "C" void sgelss_(int *m, int *n, int *nrhs, float *a, int *lda, float *b,
int *ldb, float *s, float *rcond, int *rank,
float *work, int *lwork, int *info);
extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex<double> *a,
int *lda, std::complex<double> *b, int *ldb, int *info);
extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex<float> *a,
int *lda, std::complex<float> *b, int *ldb, int *info);
extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda,
double *b, int *ldb, int *info);
extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda,
float *b, int *ldb, int *info);
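// Illustrative call through these prototypes (hypothetical 2x2 system; LAPACK
// expects column-major storage; not part of this header):
//   int m = 2, n = 2, lda = 2, info = 0;
//   int ipiv[2];
//   double a[4] = {4.0, 2.0, 1.0, 3.0};
//   dgetrf_(&m, &n, a, &lda, ipiv, &info);  // LU factorization; info == 0 on success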
#include "paddle/pten/backends/dynload/lapack.h"
#include "paddle/pten/common/complex.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag lapack_dso_flag;
extern void *lapack_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load lapack routines
* via operator overloading.
*/
#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using lapackFunc = decltype(&::__name); \
std::call_once(lapack_dso_flag, []() { \
lapack_dso_handle = paddle::platform::dynload::GetLAPACKDsoHandle(); \
}); \
    static void *p_##__name = dlsym(lapack_dso_handle, #__name); \
    return reinterpret_cast<lapackFunc>(p_##__name)(args...); \
} \
}; \
#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
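// e.g. DYNAMIC_LOAD_LAPACK_WRAP(dgetrf_) keeps the
// paddle::platform::dynload::dgetrf_ symbol available to existing fluid call
// sites while routing every call to pten::dynload::dgetrf_.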
#define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \
......
......@@ -13,13 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/cudnn.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag miopen_dso_flag;
void* miopen_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......@@ -50,19 +48,7 @@ MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(miopen_dso_flag,
[]() { miopen_dso_handle = GetCUDNNDsoHandle(); });
return miopen_dso_handle != nullptr;
}
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
miopen_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load miopen shared library. Cannot invoke method %s.",
fn_name));
}
bool HasCUDNN() { return pten::dynload::HasCUDNN(); }
} // namespace dynload
} // namespace platform
......
......@@ -18,66 +18,17 @@ limitations under the License. */
#include <miopen/miopen.h>
#include <miopen/version.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#define MIOPEN_VERSION \
(MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \
MIOPEN_VERSION_PATCH) // NOLINT
// MIOpen only supports NCHW; this typedef exists just for compatibility with
// the cuDNN API.
typedef enum {
MIOPEN_TENSOR_NCHW = 0,
MIOPEN_TENSOR_NHWC = 1,
} miopenTensorFormat_t;
#include "paddle/pten/backends/dynload/miopen.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag miopen_dso_flag;
extern void* miopen_dso_handle;
extern bool HasCUDNN();
inline const char* miopenGetErrorString(miopenStatus_t status) {
switch (status) {
case miopenStatusSuccess:
return "MIOPEN_STATUS_SUCCESS";
case miopenStatusNotInitialized:
return "MIOPEN_STATUS_NOT_INITIALIZED";
case miopenStatusInvalidValue:
return "MIOPEN_STATUS_INVALID_VALUE";
case miopenStatusBadParm:
return "MIOPEN_STATUS_BAD_PARAM";
case miopenStatusAllocFailed:
return "MIOPEN_STATUS_ALLOC_FAILED";
case miopenStatusInternalError:
return "MIOPEN_STATUS_INTERNAL_ERROR";
case miopenStatusNotImplemented:
return "MIOPEN_STATUS_NOT_IMPLEMENTED";
case miopenStatusUnsupportedOp:
return "MIOPEN_STATUS_UNSUPPORTED_OP";
case miopenStatusUnknownError:
default:
return "MIOPEN_STATUS_UNKNOWN_ERROR";
}
}
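// Usage sketch (illustrative): error paths can stay API-agnostic by formatting
// failures the same way as on the CUDA side, e.g.
//   LOG(ERROR) << "MIOpen failure: " << miopenGetErrorString(status);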
extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using miopen_func = decltype(&::__name); \
std::call_once(miopen_dso_flag, []() { \
miopen_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(miopen_dso_handle, #__name); \
return reinterpret_cast<miopen_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed miopen functions in HPPL
......@@ -145,23 +96,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(miopenRNNForwardInference); \
__macro(miopenGetTensorNumBytes);
MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \
__macro(miopenConvolutionBackwardData);
MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R3:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
__macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R4:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
__macro(miopenBatchNormalizationForwardTraining); \
__macro(miopenBatchNormalizationForwardInference); \
__macro(miopenBatchNormalizationBackward);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R5
#define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \
......@@ -169,12 +120,12 @@ MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
__macro(miopenSetActivationDescriptor); \
__macro(miopenGetActivationDescriptor); \
__macro(miopenDestroyActivationDescriptor);
MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R5(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R6
#define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \
/*__macro(miopenSetRNNDescriptor_v6);*/
MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R6(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(miopenSetConvolutionGroupCount); \
......@@ -184,7 +135,7 @@ MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
__macro(miopenSetCTCLossDescriptor); \
__macro(miopenGetCTCLossWorkspaceSize); \
__macro(miopenCTCLoss);
MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
/*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
......@@ -192,7 +143,7 @@ __macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(PLATFORM_DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag mklml_dso_flag;
void* mklml_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MKLML_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,36 +17,23 @@ limitations under the License. */
#include <mkl.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/mklml.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag mklml_dso_flag;
extern void *mklml_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load mklml routines
* via operator overloading.
*/
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklmlFunc = decltype(&::__name); \
std::call_once(mklml_dso_flag, []() { \
mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
}); \
    static void *p_##__name = dlsym(mklml_dso_handle, #__name); \
    return reinterpret_cast<mklmlFunc>(p_##__name)(args...); \
} \
}; \
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
#define PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) \
DYNAMIC_LOAD_MKLML_WRAP(__name)
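// Both spellings expand to the same alias; e.g. for cblas_sgemm (listed below):
//   using DynLoad__cblas_sgemm = pten::dynload::DynLoad__cblas_sgemm;
//   extern DynLoad__cblas_sgemm cblas_sgemm;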
#define MKLML_ROUTINE_EACH(__macro) \
__macro(cblas_sgemm); \
......@@ -111,7 +98,7 @@ extern void *mklml_dso_handle;
__macro(MKL_Set_Num_Threads); \
__macro(MKL_Get_Max_Threads);
MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
MKLML_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
#if !defined(_WIN32)
DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace platform {
......@@ -32,18 +32,8 @@ extern void* mklrt_dso_handle;
* (for each function) to dynamic load mkldfti routine
* via operator overloading.
*/
#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklrtFunc = decltype(&::__name); \
std::call_once(mklrt_dso_flag, []() { \
mklrt_dso_handle = paddle::platform::dynload::GetMKLRTDsoHandle(); \
}); \
static void* p_##__name = dlsym(mklrt_dso_handle, #__name); \
return reinterpret_cast<mklrtFunc>(p_##__name)(args...); \
} \
}; \
#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
// mkl_dfti.h has a macro that shadows the function with the same name
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nccl_dso_flag;
void *nccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,28 +16,14 @@ limitations under the License. */
#include <nccl.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nccl.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(nccl_dso_flag, []() { \
nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define NCCL_RAND_ROUTINE_EACH(__macro) \
......@@ -57,30 +43,30 @@ extern void* nccl_dso_handle;
__macro(ncclReduceScatter); \
__macro(ncclGetErrorString);
NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212
#define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2304
#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion);
NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 21100
#define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \
__macro(ncclRedOpCreatePreMulSum); \
__macro(ncclRedOpDestroy);
NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
NCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
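// Illustration (assumes NCCL >= 2.7.3 at build time): the version-gated
// wrappers above make point-to-point routines callable as, e.g.,
//   pten::dynload::ncclSend(sendbuff, count, ncclFloat, peer, comm, stream);
// against an older libnccl those declarations are simply never emitted.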
} // namespace dynload
......
......@@ -15,9 +15,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nvjpeg_dso_flag;
void *nvjpeg_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -14,27 +14,14 @@ limitations under the License. */
#include <nvjpeg.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nvjpeg.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nvjpeg_dso_flag;
extern void *nvjpeg_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
nvjpegStatus_t operator()(Args... args) { \
using nvjpegFunc = decltype(&::__name); \
std::call_once(nvjpeg_dso_flag, []() { \
nvjpeg_dso_handle = paddle::platform::dynload::GetNvjpegDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \
return reinterpret_cast<nvjpegFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define NVJPEG_RAND_ROUTINE_EACH(__macro) \
......@@ -44,7 +31,7 @@ extern void *nvjpeg_dso_handle;
__macro(nvjpegJpegStateDestroy); \
__macro(nvjpegDecode);
NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
NVJPEG_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
} // namespace dynload
} // namespace platform
......
......@@ -13,23 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/nvrtc.h"
#include "paddle/pten/backends/dynload/nvrtc.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nvrtc_dso_flag;
void* nvrtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() {
std::call_once(nvrtc_dso_flag,
[]() { nvrtc_dso_handle = GetNVRTCDsoHandle(); });
return nvrtc_dso_handle != nullptr;
}
bool HasNVRTC() { return pten::dynload::HasNVRTC(); }
} // namespace dynload
} // namespace platform
......
......@@ -17,30 +17,17 @@ limitations under the License. */
#include <nvrtc.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nvrtc.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nvrtc_dso_flag;
extern void* nvrtc_dso_handle;
extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using nvrtc_func = decltype(&::__name); \
std::call_once(nvrtc_dso_flag, []() { \
nvrtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(nvrtc_dso_handle, #__name); \
return reinterpret_cast<nvrtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed nvrtc functions
......@@ -56,9 +43,9 @@ extern bool HasNVRTC();
__macro(nvrtcGetProgramLog); \
__macro(nvrtcGetProgramLogSize)
NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
NVRTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
} // namespace dynload
} // namespace platform
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nvtx_dso_flag;
void *nvtx_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVTX_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -17,36 +17,23 @@ limitations under the License. */
#include <nvToolsExt.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/nvtx.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nvtx_dso_flag;
extern void *nvtx_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
int operator()(Args... args) { \
using nvtxFunc = decltype(&::__name); \
std::call_once(nvtx_dso_flag, []() { \
nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \
return reinterpret_cast<nvtxFunc>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \
__macro(nvtxRangePop);
NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag rccl_dso_flag;
void *rccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,28 +16,14 @@ limitations under the License. */
#include <rccl.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/rccl.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag rccl_dso_flag;
extern void* rccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(rccl_dso_flag, []() { \
rccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(rccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define RCCL_RAND_ROUTINE_EACH(__macro) \
......@@ -57,18 +43,18 @@ extern void* rccl_dso_handle;
__macro(ncclReduceScatter); \
__macro(ncclGetErrorString);
RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
RCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212
#define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
RCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
RCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
} // namespace dynload
......
......@@ -17,8 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag rocblas_dso_flag;
void *rocblas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
......
......@@ -19,16 +19,12 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/rocblas.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag rocblas_dso_flag;
extern void *rocblas_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load rocblas routines
......@@ -36,18 +32,8 @@ extern void *rocblas_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
rocblas_status operator()(Args... args) { \
using rocblas_func = decltype(&::__name); \
std::call_once(rocblas_dso_flag, []() { \
rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \
return reinterpret_cast<rocblas_func>(p_##__name)(args...); \
} \
}; \
#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
......@@ -83,7 +69,7 @@ extern void *rocblas_dso_handle;
__macro(rocblas_set_pointer_mode); \
__macro(rocblas_get_pointer_mode);
ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// APIs mirroring those added after CUDA 8.0 in cuBLAS
#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
......@@ -94,21 +80,21 @@ ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
__macro(rocblas_zgemm_strided_batched); \
__macro(rocblas_hgemm_strided_batched);
ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// HIP not supported in ROCm 3.5
// #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
// __macro(cublasSetMathMode);
// __macro(cublasGetMathMode);
// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// ROCBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(rocblas_gemm_batched_ex); \
__macro(rocblas_gemm_strided_batched_ex);
ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -13,22 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/rocm_driver.h"
#include "paddle/pten/backends/dynload/rocm_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag rocm_dso_flag;
void* rocm_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
ROCM_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() {
std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); });
return rocm_dso_handle != nullptr;
}
bool HasCUDADriver() { return pten::dynload::HasCUDADriver(); }
} // namespace dynload
} // namespace platform
......
......@@ -17,30 +17,17 @@ limitations under the License. */
#include <hip/hip_runtime.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/rocm_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag rocm_dso_flag;
extern void* rocm_dso_handle;
extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using rocm_func = decltype(&::__name); \
std::call_once(rocm_dso_flag, []() { \
rocm_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(rocm_dso_handle, #__name); \
return reinterpret_cast<rocm_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
/**
* include all needed cuda driver functions
......@@ -59,9 +46,9 @@ extern bool HasCUDADriver();
__macro(hipGetDeviceCount); \
__macro(hipDevicePrimaryCtxGetState)
ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
ROCM_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP
} // namespace dynload
} // namespace platform
......
......@@ -18,9 +18,6 @@ namespace paddle {
namespace platform {
namespace dynload {
std::once_flag warpctc_dso_flag;
void* warpctc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
WARPCTC_ROUTINE_EACH(DEFINE_WRAP);
......
......@@ -16,34 +16,19 @@ limitations under the License. */
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "warpctc/include/ctc.h"
#include "paddle/pten/backends/dynload/warpctc.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag warpctc_dso_flag;
extern void* warpctc_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load warpctc routines
* via operator overloading.
*/
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using warpctcFunc = decltype(&::__name); \
std::call_once(warpctc_dso_flag, []() { \
warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
}); \
    static void* p_##__name = dlsym(warpctc_dso_handle, #__name); \
    return reinterpret_cast<warpctcFunc>(p_##__name)(args...); \
} \
}; \
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
using DynLoad__##__name = pten::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
......
......@@ -65,30 +65,30 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/variant.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/to_string.h"
#include "paddle/pten/backends/dynload/port.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/curand.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/pten/backends/dynload/curand.h"
#include "paddle/pten/backends/dynload/cusolver.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include <error.h>
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/pten/backends/dynload/nccl.h"
#endif // __APPLE__
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hipfft.h"
#include "paddle/fluid/platform/dynload/hiprand.h"
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/dynload/hipfft.h"
#include "paddle/pten/backends/dynload/hiprand.h"
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/pten/backends/dynload/rocblas.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include <error.h> // NOLINT
#include "paddle/fluid/platform/dynload/rccl.h"
#include "paddle/pten/backends/dynload/rccl.h"
#endif // __APPLE__
#endif // PADDLE_WITH_HIP
......@@ -880,7 +880,7 @@ inline bool is_error(cudnnStatus_t stat) {
inline std::string build_nvidia_error_msg(cudnnStatus_t stat) {
std::ostringstream sout;
sout << "CUDNN error(" << stat << "), "
<< platform::dynload::cudnnGetErrorString(stat) << ". "
<< pten::dynload::cudnnGetErrorString(stat) << ". "
<< GetExternalErrorMsg(stat);
return sout.str();
}
......@@ -945,7 +945,7 @@ inline bool is_error(ncclResult_t nccl_result) {
inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) {
std::ostringstream sout;
sout << "NCCL error(" << nccl_result << "), "
<< platform::dynload::ncclGetErrorString(nccl_result) << ". ";
<< pten::dynload::ncclGetErrorString(nccl_result) << ". ";
if (errno == ENOSPC || errno == EAGAIN) {
std::string detail(strerror(errno));
detail += "\nPlease try one of the following solutions:";
......@@ -1090,7 +1090,7 @@ inline bool is_error(miopenStatus_t stat) {
inline std::string build_rocm_error_msg(miopenStatus_t stat) {
std::string msg(" Miopen error, ");
return msg + platform::dynload::miopenGetErrorString(stat) + " ";
return msg + pten::dynload::miopenGetErrorString(stat) + " ";
}
/***** ROCBLAS ERROR *****/
......@@ -1132,7 +1132,7 @@ inline bool is_error(ncclResult_t nccl_result) {
inline std::string build_rocm_error_msg(ncclResult_t nccl_result) {
std::string msg(" Rccl error, ");
return msg + platform::dynload::ncclGetErrorString(nccl_result) + " ";
return msg + pten::dynload::ncclGetErrorString(nccl_result) + " ";
}
#endif // not(__APPLE__) and PADDLE_WITH_NCCL
......@@ -1141,7 +1141,7 @@ inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; }
inline std::string build_rocm_error_msg(hipfftResult_t stat) {
std::string msg(" HIPFFT error, ");
return msg + platform::dynload::hipfftGetErrorString(stat) + " ";
return msg + pten::dynload::hipfftGetErrorString(stat) + " ";
}
namespace details {
......
......@@ -24,6 +24,8 @@ limitations under the License. */
#include <unistd.h>
#elif defined(_MSC_VER)
#include <processthreadsapi.h>
#else
#include <unistd.h>
#endif
#include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#ifdef _POSIX_C_SOURCE
#include <time.h>
#endif
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
namespace paddle {
namespace platform {
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <stdlib.h>
#include "paddle/fluid/platform/port.h"
#include "paddle/pten/backends/dynload/port.h"
#ifdef _WIN32
static unsigned sleep(unsigned seconds) {
......
add_subdirectory(dynload)
add_subdirectory(cpu)
cc_library(pten_context SRCS all_context.cc DEPS device_context)
cc_library(pten_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags)
list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
if (NOT WITH_NV_JETSON)
list(APPEND CUDA_SRCS nvjpeg.cc)
endif()
if (WITH_ROCM)
list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc)
endif()
# There is no macOS version of NCCL.
# Disable the nvrtc and cuda_driver APIs on macOS, and only do an early test on Linux and Windows.
if (NOT APPLE)
list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
if (WITH_NCCL)
list(APPEND CUDA_SRCS nccl.cc)
endif()
if (WITH_ROCM)
list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc)
if (WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc)
endif()
endif()
endif()
if (TENSORRT_FOUND)
list(APPEND CUDA_SRCS tensorrt.cc)
endif()
configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
if(WITH_ROCM)
hip_library(pten_dynload_cuda SRCS ${HIP_SRCS} DEPS pten_dynamic_loader)
cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc)
elseif (WITH_ASCEND_CL)
cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc npu_hccl)
else()
nv_library(pten_dynload_cuda SRCS ${CUDA_SRCS} DEPS pten_dynamic_loader)
cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc)
endif()
if (WITH_MKLML)
cc_library(pten_dynload_mklml SRCS mklml.cc DEPS pten_dynamic_loader mklml)
endif()
cc_library(pten_dynload_lapack SRCS lapack.cc DEPS pten_dynamic_loader)
add_dependencies(pten_dynload_lapack extern_lapack)
# TODO(TJ): add iomp, mkldnn?
if (MKL_FOUND AND WITH_ONEMKL)
message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
cc_library(pten_dynload_mklrt SRCS mklrt.cc DEPS pten_dynamic_loader)
target_include_directories(pten_dynload_mklrt PRIVATE ${MKL_INCLUDE})
endif()
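# Illustrative usage (not part of this file): a downstream target would now
# depend on the pten_* libraries instead of the fluid dynload ones, e.g.
#   cc_library(my_kernel SRCS my_kernel.cc DEPS pten_dynload_cuda)
# where my_kernel is a hypothetical consumer.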
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cublas.h"
namespace pten {
namespace dynload {
std::once_flag cublas_dso_flag;
void *cublas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2
CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3
CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4
CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cublasXt.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cublas_dso_flag;
extern void *cublas_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load cublas routines
* via operator overloading.
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublas_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublas_dso_flag, []() { \
cublas_dso_handle = pten::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublas_dso_handle, #__name); \
return reinterpret_cast<cublas_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
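// Mechanics (descriptive, not new behavior): the first call through a generated
// wrapper, e.g.
//   cublasHandle_t handle;
//   pten::dynload::cublasCreate_v2(&handle);
// dlopens libcublas exactly once (std::call_once), resolves the symbol exactly
// once (function-local static), and every later call is one indirect jump.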
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSaxpy_v2); \
__macro(cublasDaxpy_v2); \
__macro(cublasCaxpy_v2); \
__macro(cublasZaxpy_v2); \
__macro(cublasSscal_v2); \
__macro(cublasDscal_v2); \
__macro(cublasScopy_v2); \
__macro(cublasDcopy_v2); \
__macro(cublasSgemv_v2); \
__macro(cublasDgemv_v2); \
__macro(cublasCgemv_v2); \
__macro(cublasZgemv_v2); \
__macro(cublasSgemm_v2); \
__macro(cublasDgemm_v2); \
__macro(cublasCgemm_v2); \
__macro(cublasZgemm_v2); \
__macro(cublasHgemm); \
__macro(cublasSgemmEx); \
__macro(cublasSgeam); \
__macro(cublasDgeam); \
__macro(cublasStrsm_v2); \
__macro(cublasDtrsm_v2); \
__macro(cublasCtrsm_v2); \
__macro(cublasZtrsm_v2); \
__macro(cublasCreate_v2); \
__macro(cublasDestroy_v2); \
__macro(cublasSetStream_v2); \
__macro(cublasSetPointerMode_v2); \
__macro(cublasGetPointerMode_v2); \
__macro(cublasSgemmBatched); \
__macro(cublasDgemmBatched); \
__macro(cublasCgemmBatched); \
__macro(cublasZgemmBatched); \
__macro(cublasStrsmBatched); \
__macro(cublasDtrsmBatched); \
__macro(cublasCtrsmBatched); \
__macro(cublasZtrsmBatched); \
__macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \
__macro(cublasDgetriBatched); \
__macro(cublasSmatinvBatched); \
__macro(cublasDmatinvBatched); \
__macro(cublasSgetrsBatched); \
__macro(cublasDgetrsBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
__macro(cublasGemmEx); \
__macro(cublasSgemmStridedBatched); \
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000
#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \
__macro(cublasSetMathMode); \
__macro(cublasGetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.1
#if CUDA_VERSION >= 9010
#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx);
CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cublasLt.h"
namespace pten {
namespace dynload {
std::once_flag cublasLt_dso_flag;
void *cublasLt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUBLASLT_BLAS_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cublasLt.h>
#include <cuda.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cublasLt_dso_flag;
extern void *cublasLt_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load cublasLt routines
* via operator overloading.
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = pten::dynload::GetCublasLtDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
// APIs available after CUDA 10.1
// #if CUDA_VERSION >= 10100
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasLtCreate); \
__macro(cublasLtDestroy); \
__macro(cublasLtMatmul); \
__macro(cublasLtMatmulDescCreate); \
__macro(cublasLtMatmulDescDestroy); \
__macro(cublasLtMatmulDescSetAttribute); \
__macro(cublasLtMatrixLayoutCreate); \
__macro(cublasLtMatrixLayoutDestroy); \
__macro(cublasLtMatrixLayoutSetAttribute); \
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cuda_driver.h"
namespace pten {
namespace dynload {
std::once_flag cuda_dso_flag;
void* cuda_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#if CUDA_VERSION >= 10020
CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP);
#endif
CUDA_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() {
std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); });
return cuda_dso_handle != nullptr;
}
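// Guard sketch (illustrative): probe for the driver before the first cuXxx
// call, e.g.
//   if (pten::dynload::HasCUDADriver()) {
//     pten::dynload::cuInit(0);
//   }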
} // namespace dynload
} // namespace pten
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cuda_dso_flag;
extern void* cuda_dso_handle;
extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cuda_func = decltype(&::__name); \
std::call_once(cuda_dso_flag, []() { \
cuda_dso_handle = pten::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(cuda_dso_handle, #__name); \
return reinterpret_cast<cuda_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/**
* include all needed cuda driver functions
**/
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cuInit); \
__macro(cuDriverGetVersion); \
__macro(cuGetErrorString); \
__macro(cuModuleLoadData); \
__macro(cuModuleGetFunction); \
__macro(cuModuleUnload); \
__macro(cuOccupancyMaxActiveBlocksPerMultiprocessor); \
__macro(cuLaunchKernel); \
__macro(cuCtxCreate); \
__macro(cuCtxGetCurrent); \
__macro(cuDeviceGetCount); \
__macro(cuDevicePrimaryCtxGetState); \
__macro(cuDeviceGetAttribute); \
__macro(cuDeviceGet)
#if CUDA_VERSION >= 10020
#define CUDA_ROUTINE_EACH_VVM(__macro) \
__macro(cuMemGetAllocationGranularity); \
__macro(cuMemAddressReserve); \
__macro(cuMemCreate); \
__macro(cuMemMap); \
__macro(cuMemSetAccess); \
__macro(cuMemUnmap); \
__macro(cuMemRelease); \
__macro(cuMemAddressFree)
CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#endif
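// VVM above refers to the CUDA virtual memory management API (cuMemCreate,
// cuMemMap, cuMemSetAccess, ...), which first shipped in CUDA 10.2, hence the
// version guard.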
CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
namespace dynload {
std::once_flag cudnn_dso_flag;
void* cudnn_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP);
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R7
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R8
CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
return cudnn_dso_handle != nullptr;
}
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cudnn_dso_handle,
paddle::platform::errors::PreconditionNotMet(
"Cannot load cudnn shared library. Cannot invoke method %s.",
fn_name));
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cudnn_dso_flag;
extern void* cudnn_dso_handle;
extern bool HasCUDNN();
extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cudnn_func = decltype(&::__name); \
std::call_once(cudnn_dso_flag, []() { \
cudnn_dso_handle = pten::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
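// Note the order inside the generated operator(): the library is loaded under a
// once_flag, EnforceCUDNNLoaded then turns a missing libcudnn into a clear
// PreconditionNotMet error naming the function, and the dlsym result is cached
// in a function-local static.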
/**
* include all needed cudnn functions in HPPL
 * different cudnn versions have different interfaces
**/
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor); \
__macro(cudnnSetTensor4dDescriptorEx); \
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
__macro(cudnnSetFilter4dDescriptor); \
__macro(cudnnSetFilterNdDescriptor); \
__macro(cudnnGetFilterNdDescriptor); \
__macro(cudnnSetPooling2dDescriptor); \
__macro(cudnnSetPoolingNdDescriptor); \
__macro(cudnnGetPoolingNdDescriptor); \
__macro(cudnnDestroyFilterDescriptor); \
__macro(cudnnCreateConvolutionDescriptor); \
__macro(cudnnCreatePoolingDescriptor); \
__macro(cudnnDestroyPoolingDescriptor); \
__macro(cudnnSetConvolution2dDescriptor); \
__macro(cudnnDestroyConvolutionDescriptor); \
__macro(cudnnSetConvolutionNdDescriptor); \
__macro(cudnnGetConvolutionNdDescriptor); \
__macro(cudnnDeriveBNTensorDescriptor); \
__macro(cudnnCreateSpatialTransformerDescriptor); \
__macro(cudnnSetSpatialTransformerNdDescriptor); \
__macro(cudnnDestroySpatialTransformerDescriptor); \
__macro(cudnnSpatialTfGridGeneratorForward); \
__macro(cudnnSpatialTfGridGeneratorBackward); \
__macro(cudnnSpatialTfSamplerForward); \
__macro(cudnnSpatialTfSamplerBackward); \
__macro(cudnnCreate); \
__macro(cudnnDestroy); \
__macro(cudnnSetStream); \
__macro(cudnnActivationForward); \
__macro(cudnnActivationBackward); \
__macro(cudnnConvolutionForward); \
__macro(cudnnConvolutionBackwardBias); \
__macro(cudnnGetConvolutionForwardWorkspaceSize); \
__macro(cudnnTransformTensor); \
__macro(cudnnPoolingForward); \
__macro(cudnnPoolingBackward); \
__macro(cudnnSoftmaxBackward); \
__macro(cudnnSoftmaxForward); \
__macro(cudnnGetVersion); \
__macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithm); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnGetErrorString); \
__macro(cudnnCreateDropoutDescriptor); \
__macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \
__macro(cudnnRestoreDropoutDescriptor); \
__macro(cudnnCreateRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \
__macro(cudnnGetRNNTrainingReserveSize); \
__macro(cudnnRNNForwardTraining); \
__macro(cudnnRNNBackwardData); \
__macro(cudnnRNNBackwardWeights); \
__macro(cudnnRNNForwardInference); \
__macro(cudnnDestroyDropoutDescriptor); \
__macro(cudnnDestroyRNNDescriptor); \
__macro(cudnnSetTensorNdDescriptorEx); \
__macro(cudnnAddTensor); \
__macro(cudnnConvolutionBackwardData); \
__macro(cudnnConvolutionBackwardFilter); \
__macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
__macro(cudnnGetConvolutionBackwardDataWorkspaceSize); \
__macro(cudnnBatchNormalizationForwardTraining); \
__macro(cudnnBatchNormalizationForwardInference); \
__macro(cudnnBatchNormalizationBackward); \
__macro(cudnnCreateActivationDescriptor); \
__macro(cudnnSetActivationDescriptor); \
__macro(cudnnGetActivationDescriptor); \
__macro(cudnnDestroyActivationDescriptor); \
__macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount); \
__macro(cudnnSetConvolutionMathType); \
__macro(cudnnConvolutionBiasActivationForward); \
__macro(cudnnCreateCTCLossDescriptor); \
__macro(cudnnDestroyCTCLossDescriptor); \
__macro(cudnnGetCTCLossDescriptor); \
__macro(cudnnSetCTCLossDescriptor); \
__macro(cudnnGetCTCLossWorkspaceSize); \
__macro(cudnnCTCLoss); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
__macro(cudnnGetConvolutionForwardAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7201
#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
__macro(cudnnCreateRNNDataDescriptor); \
__macro(cudnnDestroyRNNDataDescriptor); \
__macro(cudnnSetRNNDataDescriptor); \
__macro(cudnnSetRNNPaddingMode); \
__macro(cudnnRNNForwardTrainingEx); \
__macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx);
CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7401
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
__macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 8000
#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) \
__macro(cudnnSetRNNDescriptor_v8); \
__macro(cudnnCreateFusedOpsPlan); \
__macro(cudnnCreateFusedOpsConstParamPack); \
__macro(cudnnCreateFusedOpsVariantParamPack); \
__macro(cudnnDestroyFusedOpsPlan); \
__macro(cudnnDestroyFusedOpsConstParamPack); \
__macro(cudnnDestroyFusedOpsVariantParamPack); \
__macro(cudnnFusedOpsExecute); \
__macro(cudnnSetFusedOpsConstParamPackAttribute); \
__macro(cudnnSetFusedOpsVariantParamPackAttribute); \
__macro(cudnnMakeFusedOpsPlan);
CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cufft.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
namespace dynload {
std::once_flag cufft_dso_flag;
void* cufft_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUFFT() {
std::call_once(cufft_dso_flag,
[]() { cufft_dso_handle = GetCUFFTDsoHandle(); });
return cufft_dso_handle != nullptr;
}
void EnforceCUFFTLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cufft_dso_handle,
paddle::platform::errors::PreconditionNotMet(
"Cannot load cufft shared library. Cannot invoke method %s.",
fn_name));
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cufft.h>
#include <cufftXt.h>
#include <glog/logging.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cufft_dso_flag;
extern void* cufft_dso_handle;
extern bool HasCUFFT();
extern void EnforceCUFFTLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cufft_func = decltype(&::__name); \
std::call_once(cufft_dso_flag, []() { \
cufft_dso_handle = pten::dynload::GetCUFFTDsoHandle(); \
}); \
EnforceCUFFTLoaded(#__name); \
static void* p_##__name = dlsym(cufft_dso_handle, #__name); \
return reinterpret_cast<cufft_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/**
 * include all needed cufft functions in HPPL;
 * different cufft versions have different interfaces
 **/
#define CUFFT_FFT_ROUTINE_EACH(__macro) \
__macro(cufftPlan1d); \
__macro(cufftPlan2d); \
__macro(cufftPlan3d); \
__macro(cufftPlanMany); \
__macro(cufftMakePlan1d); \
__macro(cufftMakePlan2d); \
__macro(cufftMakePlan3d); \
__macro(cufftMakePlanMany); \
__macro(cufftMakePlanMany64); \
__macro(cufftGetSizeMany64); \
__macro(cufftEstimate1d); \
__macro(cufftEstimate2d); \
__macro(cufftEstimate3d); \
__macro(cufftEstimateMany); \
__macro(cufftCreate); \
__macro(cufftGetSize1d); \
__macro(cufftGetSize2d); \
__macro(cufftGetSize3d); \
__macro(cufftGetSizeMany); \
__macro(cufftGetSize); \
__macro(cufftSetWorkArea); \
__macro(cufftSetAutoAllocation); \
__macro(cufftExecC2C); \
__macro(cufftExecR2C); \
__macro(cufftExecC2R); \
__macro(cufftExecZ2Z); \
__macro(cufftExecD2Z); \
__macro(cufftExecZ2D); \
__macro(cufftSetStream); \
__macro(cufftDestroy); \
__macro(cufftGetVersion); \
__macro(cufftGetProperty); \
__macro(cufftXtSetGPUs); \
__macro(cufftXtMalloc); \
__macro(cufftXtMemcpy); \
__macro(cufftXtFree); \
__macro(cufftXtSetWorkArea); \
__macro(cufftXtExecDescriptorC2C); \
__macro(cufftXtExecDescriptorR2C); \
__macro(cufftXtExecDescriptorC2R); \
__macro(cufftXtExecDescriptorZ2Z); \
__macro(cufftXtExecDescriptorD2Z); \
__macro(cufftXtExecDescriptorZ2D); \
__macro(cufftXtQueryPlan); \
__macro(cufftXtSetCallback); \
__macro(cufftXtClearCallback); \
__macro(cufftXtSetCallbackSharedSize); \
__macro(cufftXtMakePlanMany); \
__macro(cufftXtGetSizeMany); \
__macro(cufftXtExec); \
__macro(cufftXtExecDescriptor); \
__macro(cufftXtSetWorkAreaPolicy);
CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
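// Usage sketch (illustrative only; the helper name below is hypothetical and
// not referenced elsewhere): the wrappers generated above are drop-in
// replacements for the native cuFFT symbols, and the first call lazily
// dlopen()s libcufft, so no link-time dependency on the library is needed.
inline bool CufftCreateDestroyForDocs() {
  cufftHandle plan;
  if (cufftCreate(&plan) != CUFFT_SUCCESS) return false;  // triggers dlopen
  return cufftDestroy(plan) == CUFFT_SUCCESS;
}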
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUPTI
#include "paddle/pten/backends/dynload/cupti.h"
namespace pten {
namespace dynload {
std::once_flag cupti_dso_flag;
void *cupti_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUPTI_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
#endif // PADDLE_WITH_CUPTI
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUPTI
#include <cuda.h>
#include <cupti.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cupti_dso_flag;
extern void *cupti_dso_handle;
/**
 * The following macro definition can generate structs
 * (for each function) that dynamically load the cupti routine
 * via operator overloading.
 *
 * note: default dynamic linked libs
 */
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
using cuptiFunc = decltype(&::__name); \
std::call_once(cupti_dso_flag, []() { \
cupti_dso_handle = pten::dynload::GetCUPTIDsoHandle(); \
}); \
static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define CUPTI_ROUTINE_EACH(__macro) \
__macro(cuptiActivityEnable); \
__macro(cuptiActivityDisable); \
__macro(cuptiActivityRegisterCallbacks); \
__macro(cuptiActivityGetAttribute); \
__macro(cuptiActivitySetAttribute); \
__macro(cuptiGetTimestamp); \
__macro(cuptiActivityGetNextRecord); \
__macro(cuptiGetResultString); \
__macro(cuptiActivityGetNumDroppedRecords); \
__macro(cuptiActivityFlushAll); \
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback); \
__macro(cuptiEnableDomain);
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
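// Usage sketch (illustrative only; hypothetical helper): after the
// declarations above, pten::dynload::cuptiGetTimestamp behaves like the
// native CUPTI entry point, loading libcupti on first use.
inline bool CuptiTimestampForDocs(uint64_t* ts) {
  return cuptiGetTimestamp(ts) == CUPTI_SUCCESS;
}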
} // namespace dynload
} // namespace pten
#endif // PADDLE_WITH_CUPTI
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@"
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/curand.h"
namespace pten {
namespace dynload {
std::once_flag curand_dso_flag;
void *curand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <curand.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag curand_dso_flag;
extern void *curand_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
curandStatus_t operator()(Args... args) { \
using curandFunc = decltype(&::__name); \
std::call_once(curand_dso_flag, []() { \
curand_dso_handle = pten::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(curand_dso_handle, #__name); \
return reinterpret_cast<curandFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define CURAND_RAND_ROUTINE_EACH(__macro) \
__macro(curandCreateGenerator); \
__macro(curandSetStream); \
__macro(curandSetPseudoRandomGeneratorSeed); \
__macro(curandGenerateUniform); \
__macro(curandGenerateUniformDouble); \
__macro(curandGenerateNormal); \
__macro(curandDestroyGenerator);
CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cusolver.h"
namespace pten {
namespace dynload {
std::once_flag cusolver_dso_flag;
void *cusolver_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUSOLVER_ROUTINE_EACH(DEFINE_WRAP);
#ifdef CUSOLVER_ROUTINE_EACH_R1
CUSOLVER_ROUTINE_EACH_R1(DEFINE_WRAP);
#endif
#ifdef CUSOLVER_ROUTINE_EACH_R2
CUSOLVER_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
#include <cusolverDn.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cusolver_dso_flag;
extern void *cusolver_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cusolverStatus_t operator()(Args... args) { \
using cusolverFunc = decltype(&::__name); \
std::call_once(cusolver_dso_flag, []() { \
cusolver_dso_handle = pten::dynload::GetCusolverDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \
return reinterpret_cast<cusolverFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define CUSOLVER_ROUTINE_EACH(__macro) \
__macro(cusolverDnCreate); \
__macro(cusolverDnDestroy); \
__macro(cusolverDnSetStream); \
__macro(cusolverDnSpotrf_bufferSize); \
__macro(cusolverDnDpotrf_bufferSize); \
__macro(cusolverDnSpotrf); \
__macro(cusolverDnDpotrf); \
__macro(cusolverDnSpotrs); \
__macro(cusolverDnDpotrs); \
__macro(cusolverDnCpotrs); \
__macro(cusolverDnZpotrs); \
__macro(cusolverDnSsyevd_bufferSize); \
__macro(cusolverDnDsyevd_bufferSize); \
__macro(cusolverDnCheevd_bufferSize); \
__macro(cusolverDnZheevd_bufferSize); \
__macro(cusolverDnSsyevd); \
__macro(cusolverDnDsyevd); \
__macro(cusolverDnCheevd); \
__macro(cusolverDnZheevd);
CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
#if CUDA_VERSION >= 9020
#define CUSOLVER_ROUTINE_EACH_R1(__macro) \
__macro(cusolverDnSpotrfBatched); \
__macro(cusolverDnDpotrfBatched); \
__macro(cusolverDnSpotrsBatched); \
__macro(cusolverDnDpotrsBatched); \
__macro(cusolverDnSgesvdj_bufferSize); \
__macro(cusolverDnSgetrf_bufferSize); \
__macro(cusolverDnDgetrf_bufferSize); \
__macro(cusolverDnCgetrf_bufferSize); \
__macro(cusolverDnZgetrf_bufferSize); \
__macro(cusolverDnSgeqrf_bufferSize); \
__macro(cusolverDnDgeqrf_bufferSize); \
__macro(cusolverDnCgeqrf_bufferSize); \
__macro(cusolverDnZgeqrf_bufferSize); \
__macro(cusolverDnSorgqr_bufferSize); \
__macro(cusolverDnDorgqr_bufferSize); \
__macro(cusolverDnSormqr_bufferSize); \
__macro(cusolverDnDormqr_bufferSize); \
__macro(cusolverDnCungqr_bufferSize); \
__macro(cusolverDnZungqr_bufferSize); \
__macro(cusolverDnDestroyGesvdjInfo); \
__macro(cusolverDnCreateGesvdjInfo); \
__macro(cusolverDnDgesvdj_bufferSize); \
__macro(cusolverDnSgesvdj); \
__macro(cusolverDnDgesvdj); \
__macro(cusolverDnSgetrf); \
__macro(cusolverDnDgetrf); \
__macro(cusolverDnCgetrf); \
__macro(cusolverDnZgetrf); \
__macro(cusolverDnSgeqrf); \
__macro(cusolverDnDgeqrf); \
__macro(cusolverDnCgeqrf); \
__macro(cusolverDnZgeqrf); \
__macro(cusolverDnSorgqr); \
__macro(cusolverDnDorgqr); \
__macro(cusolverDnSormqr); \
__macro(cusolverDnDormqr); \
__macro(cusolverDnCungqr); \
__macro(cusolverDnZungqr);
CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#if CUDA_VERSION >= 9020
#define CUSOLVER_ROUTINE_EACH_R2(__macro) \
__macro(cusolverDnCreateSyevjInfo); \
__macro(cusolverDnSsyevj_bufferSize); \
__macro(cusolverDnDsyevj_bufferSize); \
__macro(cusolverDnSsyevj); \
__macro(cusolverDnDsyevj); \
__macro(cusolverDnDestroySyevjInfo);
CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/cusparse.h"
namespace pten {
namespace dynload {
std::once_flag cusparse_dso_flag;
void *cusparse_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#ifdef CUSPARSE_ROUTINE_EACH
CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif
#ifdef CUSPARSE_ROUTINE_EACH_R2
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
#ifdef CUSPARSE_ROUTINE_EACH_11020
CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
#include <cusparse.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag cusparse_dso_flag;
extern void *cusparse_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cusparseStatus_t operator()(Args... args) { \
using cusparseFunc = decltype(&::__name); \
std::call_once(cusparse_dso_flag, []() { \
cusparse_dso_handle = pten::dynload::GetCusparseDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
return reinterpret_cast<cusparseFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported since CUDA 10.1
#if CUDA_VERSION >= 10010
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
__macro(cusparseSpMM_bufferSize); \
__macro(cusparseSpMM); \
__macro(cusparseDestroySpMat); \
__macro(cusparseDestroyDnMat); \
__macro(cusparseCooSetPointers); \
__macro(cusparseCsrSetPointers); \
__macro(cusparseDenseToSparse_bufferSize); \
__macro(cusparseDenseToSparse_analysis); \
__macro(cusparseDenseToSparse_convert); \
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
__macro(cusparseSDDMM_preprocess); \
__macro(cusparseSDDMM);
CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include <cstdlib>
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/cupti_lib_path.h"
#if defined(_WIN32)
#include <windows.h>
#endif
// TODO(wilber): The pten computing library requires a component to manage flags
// (maybe not use gflags).
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(cudnn_dir,
"",
"Specify path for loading libcudnn.so. For instance, "
"/usr/local/cudnn/lib. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH");
DEFINE_string(
cuda_dir,
"",
"Specify path for loading cuda library, such as libcublas, libcublasLt "
"libcurand, libcusolver. For instance, /usr/local/cuda/lib64. "
"If default, dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(nccl_dir,
"",
"Specify path for loading nccl library, such as libnccl.so. "
"For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(hccl_dir,
"",
"Specify path for loading hccl library, such as libhccl.so. "
"For instance, "
"/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. If "
"default, "
"dlopen will search hccl from LD_LIBRARY_PATH");
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
DEFINE_string(
tensorrt_dir,
"",
"Specify path for loading tensorrt library, such as libnvinfer.so.");
DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
DEFINE_string(mkl_dir,
"",
"Specify path for loading libmkl_rt.so. "
"For insrance, /opt/intel/oneapi/mkl/latest/lib/intel64/."
"If default, "
"dlopen will search mkl from LD_LIBRARY_PATH");
DEFINE_string(op_dir, "", "Specify path for loading user-defined op library.");
#ifdef PADDLE_WITH_HIP
DEFINE_string(miopen_dir,
"",
"Specify path for loading libMIOpen.so. For instance, "
"/opt/rocm/miopen/lib. If empty [default], dlopen "
"will search miopen from LD_LIBRARY_PATH");
DEFINE_string(rocm_dir,
"",
"Specify path for loading rocm library, such as librocblas, "
"libmiopen, libhipsparse. For instance, /opt/rocm/lib. "
"If default, dlopen will search rocm from LD_LIBRARY_PATH");
DEFINE_string(rccl_dir,
"",
"Specify path for loading rccl library, such as librccl.so. "
"For instance, /opt/rocm/rccl/lib. If default, "
"dlopen will search rccl from LD_LIBRARY_PATH");
#endif
namespace pten {
namespace dynload {
struct PathNode {
PathNode() {}
std::string path = "";
};
static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
// NOTE: In order to adapt to the default installation path of cuda
#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin";
#else
static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64";
#endif
static PathNode s_py_site_pkg_path;
#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll";
static constexpr char* win_cublas_lib =
"cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cublas64_" CUDA_VERSION_MAJOR ".dll";
#if CUDA_VERSION >= 11000
static constexpr char* win_curand_lib =
"curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll";
static constexpr char* win_nvjpeg_lib =
"nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll";
static constexpr char* win_cusolver_lib =
"cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll";
static constexpr char* win_cusparse_lib =
"cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll";
static constexpr char* win_cufft_lib =
"cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_10.dll";
#else
static constexpr char* win_curand_lib =
"curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;curand64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_nvjpeg_lib =
"nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cusolver_lib =
"cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cusparse_lib =
"cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll";
static constexpr char* win_cufft_lib =
"cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
".dll;cufft64_" CUDA_VERSION_MAJOR ".dll";
#endif // CUDA_VERSION
#endif
static inline std::string join(const std::string& part1,
const std::string& part2) {
// directory separator
const char sep = '/';
if (!part2.empty() && part2.front() == sep) {
return part2;
}
std::string ret;
ret.reserve(part1.size() + part2.size() + 1);
ret = part1;
if (!ret.empty() && ret.back() != sep) {
ret += sep;
}
ret += part2;
return ret;
}
static inline std::vector<std::string> split(
const std::string& str, const std::string separator = " ") {
std::vector<std::string> str_list;
std::string::size_type firstPos;
firstPos = str.find_first_not_of(separator, 0);
std::string::size_type lastPos;
lastPos = str.find_first_of(separator, firstPos);
while (std::string::npos != firstPos && std::string::npos != lastPos) {
str_list.push_back(str.substr(firstPos, lastPos - firstPos));
firstPos = str.find_first_not_of(separator, lastPos);
lastPos = str.find_first_of(separator, firstPos);
}
  if (std::string::npos != firstPos && std::string::npos == lastPos) {
    // guard firstPos: substr(npos, ...) would throw for an empty input or an
    // input consisting only of separators
    str_list.push_back(str.substr(firstPos, lastPos - firstPos));
  }
return str_list;
}
void SetPaddleLibPath(const std::string& py_site_pkg_path) {
s_py_site_pkg_path.path = py_site_pkg_path;
VLOG(3) << "Set paddle lib path : " << py_site_pkg_path;
}
static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
const std::string& dso_name,
int dynload_flags) {
void* dso_handle = nullptr;
if (!spec_path.empty()) {
// search xxx.so from custom path
VLOG(3) << "Try to find library: " << dso_name
<< " from specific path: " << spec_path;
std::string dso_path = join(spec_path, dso_name);
dso_handle = dlopen(dso_path.c_str(), dynload_flags);
}
return dso_handle;
}
static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
int dynload_flags) {
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
// and /usr/local/lib path
void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
VLOG(3) << "Try to find library: " << dso_path
<< " from default system path.";
// TODO(chenweihang): This path is used to search which libs?
// DYLD_LIBRARY_PATH is disabled since Mac OS 10.11 by System
// Integrity Protection (SIP), so if dso_handle is null, search
// the default package path on Mac OS.
#if defined(__APPLE__) || defined(__OSX__)
if (nullptr == dso_handle) {
dso_handle =
dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
}
#endif
return dso_handle;
}
/*
 * We define three priorities for dynamic library search:
 *
 * First: Search the path specified by the user
 * Second: Search the system default path
 * Third: Search a special path corresponding to a specific library,
 *        to adapt to changes and remain easy to extend.
 */
static inline void* GetDsoHandleFromSearchPath(
const std::string& config_path,
const std::string& dso_name,
bool throw_on_error = true,
const std::vector<std::string>& extra_paths = std::vector<std::string>(),
const std::string& warning_msg = std::string()) {
#if !defined(_WIN32)
int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
#else
int dynload_flags = 0;
#endif // !_WIN32
std::vector<std::string> dso_names = split(dso_name, ";");
void* dso_handle = nullptr;
for (auto dso : dso_names) {
// 1. search in user config path by FLAGS
dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags);
// 2. search in system default path
if (nullptr == dso_handle) {
dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags);
}
// 3. search in extra paths
if (nullptr == dso_handle) {
for (auto path : extra_paths) {
VLOG(3) << "extra_paths: " << path;
dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags);
}
}
if (nullptr != dso_handle) break;
}
// 4. [If Failed for All dso_names] logging warning if exists
if (nullptr == dso_handle && !warning_msg.empty()) {
LOG(WARNING) << warning_msg;
}
// 5. [If Failed for All dso_names] logging or throw error info
if (nullptr == dso_handle) {
auto error_msg =
"The third-party dynamic library (%s) that Paddle depends on is not "
"configured correctly. (error code is %s)\n"
" Suggestions:\n"
" 1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) "
"is installed correctly and its version is matched with paddlepaddle "
"you installed.\n"
" 2. Configure third-party dynamic library environment variables as "
"follows:\n"
" - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
" - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
" - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` "
"[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
"impossible unless System Integrity Protection (SIP) is disabled.]";
#if !defined(_WIN32)
auto errorno = dlerror();
#else
auto errorno = GetLastError();
#endif // !_WIN32
if (throw_on_error) {
// NOTE: Special error report case, no need to change its format
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
error_msg, dso_name, errorno));
} else {
LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno);
}
}
return dso_handle;
}
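// Usage sketch (illustrative only; "libfoo" is a placeholder, mirroring the
// Get*DsoHandle wrappers below): resolve one of several candidate names,
// preferring the user-configured directory, then the system default path,
// then an extra fallback directory:
//
//   void* handle = GetDsoHandleFromSearchPath(
//       FLAGS_cuda_dir, "libfoo.so.11;libfoo.so", /*throw_on_error=*/false,
//       {cuda_lib_path});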
void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
#endif
}
void* GetCublasLtDsoHandle() {
// APIs available after CUDA 10.1
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
#else
  std::string warning_msg(
      "Your CUDA version is earlier than 10.1 and does not support CublasLt. "
      "If you want to use CublasLt, please upgrade CUDA and rebuild "
      "PaddlePaddle.");
  LOG(WARNING) << warning_msg;
  return nullptr;
#endif
}
void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
  std::string mac_warn_msg(
      "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
      "For instance, sudo tar -xzf "
      "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
      "chmod a+r /usr/local/cuda/include/cudnn.h "
      "/usr/local/cuda/lib/libcudnn*");
  return GetDsoHandleFromSearchPath(
      FLAGS_cudnn_dir, "libcudnn.dylib", false, {}, mac_warn_msg);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
  std::string win_warn_msg(
      "Note: [Recommend] copy cudnn into CUDA installation directory. \n "
      "For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from "
      "NVIDIA's official website, \n"
      "then unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing "
      "Toolkit\\CUDA\\v10.0\n"
      "You should do this according to your CUDA installation directory and "
      "CUDNN version.");
  return GetDsoHandleFromSearchPath(
      FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_msg);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
#else
return GetDsoHandleFromSearchPath(
FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path});
#endif
}
void* GetCUPTIDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(
FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path});
#else
return GetDsoHandleFromSearchPath(
FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
#endif
}
void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path});
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
#endif
}
#ifdef PADDLE_WITH_HIP
void* GetROCFFTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so");
#endif
}
#endif
void* GetNvjpegDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_nvjpeg_lib, true, {cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so");
#endif
}
void* GetCusolverDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
#endif
}
void* GetCusparseDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so");
#endif
}
void* GetNVRTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
#endif
}
void* GetCUDADsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
#elif defined(_WIN32)
char system32_dir[MAX_PATH];
GetSystemDirectory(system32_dir, MAX_PATH);
return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
#endif
}
void* GetWarpCTCDsoHandle() {
std::string warpctc_dir = "";
if (!s_py_site_pkg_path.path.empty()) {
warpctc_dir = s_py_site_pkg_path.path;
}
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll");
#else
return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so");
#endif
}
void* GetNCCLDsoHandle() {
#ifdef PADDLE_WITH_HIP
  std::string warning_msg(
      "You may need to install 'rccl' from the ROCM official website: "
      "https://rocmdocs.amd.com/en/latest/Installation_Guide/"
      "Installation-Guide.html before installing PaddlePaddle.");
#else
  std::string warning_msg(
      "You may need to install 'nccl2' from the NVIDIA official website: "
      "https://developer.nvidia.com/nccl/nccl-download "
      "before installing PaddlePaddle.");
#endif
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(
FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(
FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg);
#else
return GetDsoHandleFromSearchPath(
FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg);
#endif
}
void* GetHCCLDsoHandle() {
  std::string warning_msg(
      "You may need to install 'hccl2' from the Huawei official website "
      "before installing PaddlePaddle.");
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(
FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
#elif defined(PADDLE_WITH_ASCEND_CL)
return GetDsoHandleFromSearchPath(
FLAGS_hccl_dir, "libhccl.so", true, {}, warning_msg);
#else
return GetDsoHandleFromSearchPath(
FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg);
#endif
}
void* GetTensorRtDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
#endif
}
void* GetMKLMLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
#endif
}
void* GetLAPACKDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3");
#endif
}
void* GetOpDsoHandle(const std::string& dso_name) {
return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
}
void* GetNvtxDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
  PADDLE_THROW(paddle::platform::errors::Unimplemented(
      "Nvtx is not supported on Apple."));
#elif defined(_WIN32)
  PADDLE_THROW(paddle::platform::errors::Unimplemented(
      "Nvtx is not supported on Windows."));
#elif !defined(PADDLE_WITH_CUDA)
  PADDLE_THROW(paddle::platform::errors::Unimplemented(
      "Nvtx is not supported without CUDA."));
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
#endif
}
void* GetCUFFTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so");
#endif
}
void* GetMKLRTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so");
#endif
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
namespace pten {
namespace dynload {
#ifndef _WIN32
#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
#else
#define DECLARE_TYPE(__name, ...) decltype(auto)
#endif
void* GetCublasDsoHandle();
void* GetCublasLtDsoHandle();
void* GetCUDNNDsoHandle();
void* GetCUPTIDsoHandle();
void* GetCurandDsoHandle();
void* GetNvjpegDsoHandle();
void* GetCusolverDsoHandle();
void* GetCusparseDsoHandle();
void* GetNVRTCDsoHandle();
void* GetCUDADsoHandle();
void* GetWarpCTCDsoHandle();
void* GetNCCLDsoHandle();
void* GetHCCLDsoHandle();
void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle();
void* GetLAPACKDsoHandle();
void* GetOpDsoHandle(const std::string& dso_name);
void* GetNvtxDsoHandle();
void* GetCUFFTDsoHandle();
void* GetMKLRTDsoHandle();
void* GetROCFFTDsoHandle();
void SetPaddleLibPath(const std::string&);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/hipfft.h"
namespace pten {
namespace dynload {
std::once_flag hipfft_dso_flag;
void *hipfft_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HIP
#include <hipfft.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag hipfft_dso_flag;
extern void *hipfft_dso_handle;
#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hipfftFunc = decltype(&::__name); \
std::call_once(hipfft_dso_flag, []() { \
hipfft_dso_handle = pten::dynload::GetROCFFTDsoHandle(); \
}); \
static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \
return reinterpret_cast<hipfftFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define HIPFFT_FFT_ROUTINE_EACH(__macro) \
__macro(hipfftPlan1d); \
__macro(hipfftPlan2d); \
__macro(hipfftPlan3d); \
__macro(hipfftPlanMany); \
__macro(hipfftMakePlan1d); \
__macro(hipfftMakePlanMany); \
__macro(hipfftMakePlanMany64); \
__macro(hipfftGetSizeMany64); \
__macro(hipfftEstimate1d); \
__macro(hipfftEstimate2d); \
__macro(hipfftEstimate3d); \
__macro(hipfftEstimateMany); \
__macro(hipfftCreate); \
__macro(hipfftGetSize1d); \
__macro(hipfftGetSizeMany); \
__macro(hipfftGetSize); \
__macro(hipfftSetWorkArea); \
__macro(hipfftSetAutoAllocation); \
__macro(hipfftExecC2C); \
__macro(hipfftExecR2C); \
__macro(hipfftExecC2R); \
__macro(hipfftExecZ2Z); \
__macro(hipfftExecD2Z); \
__macro(hipfftExecZ2D); \
__macro(hipfftSetStream); \
__macro(hipfftDestroy); \
__macro(hipfftGetVersion); \
__macro(hipfftGetProperty);
HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
inline const char *hipfftGetErrorString(hipfftResult_t status) {
switch (status) {
case HIPFFT_SUCCESS:
return "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
case HIPFFT_INVALID_PLAN:
return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
case HIPFFT_ALLOC_FAILED:
return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
"memory.";
case HIPFFT_INVALID_TYPE:
return "'HIPFFT_INVALID_TYPE'. No longer used.";
case HIPFFT_INVALID_VALUE:
return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
"parameter.";
case HIPFFT_INTERNAL_ERROR:
return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
"error.";
case HIPFFT_EXEC_FAILED:
return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
case HIPFFT_SETUP_FAILED:
return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
case HIPFFT_INVALID_SIZE:
return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
case HIPFFT_UNALIGNED_DATA:
return "'HIPFFT_UNALIGNED_DATA'. No longer used.";
case HIPFFT_INCOMPLETE_PARAMETER_LIST:
return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
case HIPFFT_INVALID_DEVICE:
return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
"GPU than plan creation.";
case HIPFFT_PARSE_ERROR:
return "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
case HIPFFT_NO_WORKSPACE:
return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
"plan execution.";
case HIPFFT_NOT_IMPLEMENTED:
return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
"functionality for parameters given.";
case HIPFFT_NOT_SUPPORTED:
return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
"parameters given.";
default:
return "HIPFFT_STATUS_UNKNOWN_ERROR";
}
}
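// Usage sketch (illustrative only; hypothetical helper): pair the dynloaded
// entry points with hipfftGetErrorString above to surface readable failure
// reasons; a null return means the call succeeded.
inline const char *HipfftErrorOrNullForDocs(hipfftResult_t ret) {
  return ret == HIPFFT_SUCCESS ? nullptr : hipfftGetErrorString(ret);
}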
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/hiprand.h"
namespace pten {
namespace dynload {
std::once_flag hiprand_dso_flag;
void *hiprand_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hiprand.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/port.h"
#include "paddle/pten/backends/dynload/dynamic_loader.h"
namespace pten {
namespace dynload {
extern std::once_flag hiprand_dso_flag;
extern void *hiprand_dso_handle;
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
hiprandStatus_t operator()(Args... args) { \
using hiprandFunc = decltype(&::__name); \
std::call_once(hiprand_dso_flag, []() { \
hiprand_dso_handle = pten::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \
return reinterpret_cast<hiprandFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define HIPRAND_RAND_ROUTINE_EACH(__macro) \
__macro(hiprandCreateGenerator); \
__macro(hiprandSetStream); \
__macro(hiprandSetPseudoRandomGeneratorSeed); \
__macro(hiprandGenerateUniform); \
__macro(hiprandGenerateUniformDouble); \
__macro(hiprandGenerateNormal); \
__macro(hiprandDestroyGenerator);
HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/hiprtc.h"
namespace pten {
namespace dynload {
std::once_flag hiprtc_dso_flag;
void* hiprtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() {
std::call_once(hiprtc_dso_flag,
[]() { hiprtc_dso_handle = GetNVRTCDsoHandle(); });
return hiprtc_dso_handle != nullptr;
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hip/hiprtc.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag hiprtc_dso_flag;
extern void* hiprtc_dso_handle;
extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hiprtc_func = decltype(&::__name); \
std::call_once(hiprtc_dso_flag, []() { \
hiprtc_dso_handle = pten::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \
return reinterpret_cast<hiprtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/**
* include all needed hiprtc functions
**/
#define HIPRTC_ROUTINE_EACH(__macro) \
__macro(hiprtcVersion); \
__macro(hiprtcGetErrorString); \
__macro(hiprtcCompileProgram); \
__macro(hiprtcCreateProgram); \
__macro(hiprtcDestroyProgram); \
__macro(hiprtcGetCode); \
__macro(hiprtcGetCodeSize); \
__macro(hiprtcGetProgramLog); \
__macro(hiprtcGetProgramLogSize)
HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
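// Usage sketch (illustrative only; hypothetical helper): query the hiprtc
// version through the dynloaded wrapper; the first call dlopen()s the
// library resolved by GetNVRTCDsoHandle().
inline bool HiprtcVersionForDocs(int* major, int* minor) {
  return hiprtcVersion(major, minor) == HIPRTC_SUCCESS;
}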
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/lapack.h"
#include <mutex>
namespace pten {
namespace dynload {
std::once_flag lapack_dso_flag;
void* lapack_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
LAPACK_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <complex>
#include <mutex>
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
// Note(zhouwei): LAPACK does not ship an appropriate header file, so the
// needed API declarations have to be spelled out manually.
// getrf_ (for example):
extern "C" void dgetrf_(
int *m, int *n, double *a, int *lda, int *ipiv, int *info);
extern "C" void sgetrf_(
int *m, int *n, float *a, int *lda, int *ipiv, int *info);
// evd
extern "C" void zheevd_(char *jobz,
char *uplo,
int *n,
std::complex<double> *a,
int *lda,
double *w,
std::complex<double> *work,
int *lwork,
double *rwork,
int *lrwork,
int *iwork,
int *liwork,
int *info);
extern "C" void cheevd_(char *jobz,
char *uplo,
int *n,
std::complex<float> *a,
int *lda,
float *w,
std::complex<float> *work,
int *lwork,
float *rwork,
int *lrwork,
int *iwork,
int *liwork,
int *info);
extern "C" void dsyevd_(char *jobz,
char *uplo,
int *n,
double *a,
int *lda,
double *w,
double *work,
int *lwork,
int *iwork,
int *liwork,
int *info);
extern "C" void ssyevd_(char *jobz,
char *uplo,
int *n,
float *a,
int *lda,
float *w,
float *work,
int *lwork,
int *iwork,
int *liwork,
int *info);
// geev
extern "C" void dgeev_(char *jobvl,
char *jobvr,
int *n,
double *a,
int *lda,
double *wr,
double *wi,
double *vl,
int *ldvl,
double *vr,
int *ldvr,
double *work,
int *lwork,
int *info);
extern "C" void sgeev_(char *jobvl,
char *jobvr,
int *n,
float *a,
int *lda,
float *wr,
float *wi,
float *vl,
int *ldvl,
float *vr,
int *ldvr,
float *work,
int *lwork,
int *info);
extern "C" void zgeev_(char *jobvl,
char *jobvr,
int *n,
std::complex<double> *a,
int *lda,
std::complex<double> *w,
std::complex<double> *vl,
int *ldvl,
std::complex<double> *vr,
int *ldvr,
std::complex<double> *work,
int *lwork,
double *rwork,
int *info);
extern "C" void cgeev_(char *jobvl,
char *jobvr,
int *n,
std::complex<float> *a,
int *lda,
std::complex<float> *w,
std::complex<float> *vl,
int *ldvl,
std::complex<float> *vr,
int *ldvr,
std::complex<float> *work,
int *lwork,
float *rwork,
int *info);
// gels
extern "C" void dgels_(char *trans,
int *m,
int *n,
int *nrhs,
double *a,
int *lda,
double *b,
int *ldb,
double *work,
int *lwork,
int *info);
extern "C" void sgels_(char *trans,
int *m,
int *n,
int *nrhs,
float *a,
int *lda,
float *b,
int *ldb,
float *work,
int *lwork,
int *info);
// gelsd
extern "C" void dgelsd_(int *m,
int *n,
int *nrhs,
double *a,
int *lda,
double *b,
int *ldb,
double *s,
double *rcond,
int *rank,
double *work,
int *lwork,
int *iwork,
int *info);
extern "C" void sgelsd_(int *m,
int *n,
int *nrhs,
float *a,
int *lda,
float *b,
int *ldb,
float *s,
float *rcond,
int *rank,
float *work,
int *lwork,
int *iwork,
int *info);
// gelsy
extern "C" void dgelsy_(int *m,
int *n,
int *nrhs,
double *a,
int *lda,
double *b,
int *ldb,
int *jpvt,
double *rcond,
int *rank,
double *work,
int *lwork,
int *info);
extern "C" void sgelsy_(int *m,
int *n,
int *nrhs,
float *a,
int *lda,
float *b,
int *ldb,
int *jpvt,
float *rcond,
int *rank,
float *work,
int *lwork,
int *info);
// gelss
extern "C" void dgelss_(int *m,
int *n,
int *nrhs,
double *a,
int *lda,
double *b,
int *ldb,
double *s,
double *rcond,
int *rank,
double *work,
int *lwork,
int *info);
extern "C" void sgelss_(int *m,
int *n,
int *nrhs,
float *a,
int *lda,
float *b,
int *ldb,
float *s,
float *rcond,
int *rank,
float *work,
int *lwork,
int *info);
extern "C" void zpotrs_(char *uplo,
int *n,
int *nrhs,
std::complex<double> *a,
int *lda,
std::complex<double> *b,
int *ldb,
int *info);
extern "C" void cpotrs_(char *uplo,
int *n,
int *nrhs,
std::complex<float> *a,
int *lda,
std::complex<float> *b,
int *ldb,
int *info);
extern "C" void dpotrs_(char *uplo,
int *n,
int *nrhs,
double *a,
int *lda,
double *b,
int *ldb,
int *info);
extern "C" void spotrs_(char *uplo,
int *n,
int *nrhs,
float *a,
int *lda,
float *b,
int *ldb,
int *info);
namespace pten {
namespace dynload {
extern std::once_flag lapack_dso_flag;
extern void *lapack_dso_handle;
/**
 * The following macro definition generates a struct
 * (for each function) that dynamically loads the lapack routine
 * via operator overloading.
 */
#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using lapackFunc = decltype(&::__name); \
std::call_once(lapack_dso_flag, []() { \
lapack_dso_handle = pten::dynload::GetLAPACKDsoHandle(); \
}); \
    static void *p_##__name = dlsym(lapack_dso_handle, #__name);     \
    return reinterpret_cast<lapackFunc>(p_##__name)(args...);        \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \
DYNAMIC_LOAD_LAPACK_WRAP(__name)
#define LAPACK_ROUTINE_EACH(__macro) \
__macro(dgetrf_); \
__macro(sgetrf_); \
__macro(zheevd_); \
__macro(cheevd_); \
__macro(dsyevd_); \
__macro(ssyevd_); \
__macro(dgeev_); \
__macro(sgeev_); \
__macro(zgeev_); \
__macro(cgeev_); \
__macro(dgels_); \
__macro(sgels_); \
__macro(dgelsd_); \
__macro(sgelsd_); \
__macro(dgelsy_); \
__macro(sgelsy_); \
__macro(dgelss_); \
__macro(sgelss_); \
__macro(zpotrs_); \
__macro(cpotrs_); \
__macro(dpotrs_); \
__macro(spotrs_);
LAPACK_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_LAPACK_WRAP);
#undef DYNAMIC_LOAD_LAPACK_WRAP
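// Usage sketch (illustrative only, assuming the LAPACK library can be found
// at runtime): the wrappers are called exactly like the native symbols, e.g.
// an LU factorization of a column-major 2x2 matrix:
//
//   int m = 2, n = 2, lda = 2, info = 0;
//   int ipiv[2];
//   double a[4] = {4.0, 3.0, 6.0, 3.0};
//   pten::dynload::dgetrf_(&m, &n, a, &lda, ipiv, &info);
//   // info == 0 indicates a successful factorization.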
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
namespace dynload {
std::once_flag miopen_dso_flag;
void* miopen_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MIOPEN_DNN_ROUTINE_EACH(DEFINE_WRAP);
MIOPEN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R3
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R4
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_R5
MIOPEN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_R6
MIOPEN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_R7
MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R7
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(miopen_dso_flag,
[]() { miopen_dso_handle = GetCUDNNDsoHandle(); });
return miopen_dso_handle != nullptr;
}
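// Illustrative call pattern (hypothetical caller code, not part of this
// file): guard miopen wrapper calls behind the availability check, e.g.
//   if (pten::dynload::HasCUDNN()) { /* safe to use the miopen wrappers */ }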
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
miopen_dso_handle,
paddle::platform::errors::PreconditionNotMet(
"Cannot load miopen shared library. Cannot invoke method %s.",
fn_name));
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <miopen/miopen.h>
#include <miopen/version.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
#define MIOPEN_VERSION \
(MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \
MIOPEN_VERSION_PATCH) // NOLINT
// MIOPEN only supports NCHW; this typedef exists for compatibility with the
// cuDNN API.
typedef enum {
MIOPEN_TENSOR_NCHW = 0,
MIOPEN_TENSOR_NHWC = 1,
} miopenTensorFormat_t;
namespace pten {
namespace dynload {
extern std::once_flag miopen_dso_flag;
extern void* miopen_dso_handle;
extern bool HasCUDNN();
inline const char* miopenGetErrorString(miopenStatus_t status) {
switch (status) {
case miopenStatusSuccess:
return "MIOPEN_STATUS_SUCCESS";
case miopenStatusNotInitialized:
return "MIOPEN_STATUS_NOT_INITIALIZED";
case miopenStatusInvalidValue:
return "MIOPEN_STATUS_INVALID_VALUE";
case miopenStatusBadParm:
return "MIOPEN_STATUS_BAD_PARAM";
case miopenStatusAllocFailed:
return "MIOPEN_STATUS_ALLOC_FAILED";
case miopenStatusInternalError:
return "MIOPEN_STATUS_INTERNAL_ERROR";
case miopenStatusNotImplemented:
return "MIOPEN_STATUS_NOT_IMPLEMENTED";
case miopenStatusUnsupportedOp:
return "MIOPEN_STATUS_UNSUPPORTED_OP";
case miopenStatusUnknownError:
default:
return "MIOPEN_STATUS_UNKNOWN_ERROR";
}
}
extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using miopen_func = decltype(&::__name); \
std::call_once(miopen_dso_flag, []() { \
miopen_dso_handle = pten::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
static void* p_##__name = dlsym(miopen_dso_handle, #__name); \
return reinterpret_cast<miopen_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
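// For example, DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(miopenCreate) defines a
// struct DynLoad__miopenCreate whose operator() lazily opens the miopen
// library, resolves "miopenCreate" via dlsym, and forwards the call; it also
// declares a global instance named miopenCreate.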
/**
* include all needed miopen functions in HPPL
**/
#define MIOPEN_DNN_ROUTINE_EACH(__macro) \
__macro(miopenGetVersion); \
__macro(miopenOpTensor); \
__macro(miopenSet4dTensorDescriptor); \
__macro(miopenSetTensorDescriptor); \
__macro(miopenInitConvolutionNdDescriptor); \
__macro(miopenFindConvolutionForwardAlgorithm); \
__macro(miopenGetConvolutionNdForwardOutputDim); \
__macro(miopenFindConvolutionBackwardDataAlgorithm); \
__macro(miopenFindConvolutionBackwardWeightsAlgorithm); \
__macro(miopenGetTensorDescriptor); \
__macro(miopenCreateTensorDescriptor); \
__macro(miopenDestroyTensorDescriptor); \
__macro(miopenGetTensorDescriptorSize); \
__macro(miopenSet2dPoolingDescriptor); \
__macro(miopenGet2dPoolingDescriptor); \
__macro(miopenGetPoolingNdForwardOutputDim); \
__macro(miopenCreateConvolutionDescriptor); \
__macro(miopenCreatePoolingDescriptor); \
__macro(miopenDestroyPoolingDescriptor); \
__macro(miopenPoolingGetWorkSpaceSize); \
__macro(miopenPoolingGetWorkSpaceSizeV2); \
__macro(miopenSetNdPoolingDescriptor); \
__macro(miopenInitConvolutionDescriptor); \
__macro(miopenDestroyConvolutionDescriptor); \
__macro(miopenGetConvolutionNdDescriptor); \
__macro(miopenDeriveBNTensorDescriptor); \
__macro(miopenCreate); \
__macro(miopenDestroy); \
__macro(miopenSetStream); \
__macro(miopenActivationForward); \
__macro(miopenActivationBackward); \
__macro(miopenConvolutionBackwardWeights); \
__macro(miopenConvolutionForward); \
__macro(miopenConvolutionForwardBias); \
__macro(miopenConvolutionBackwardBias); \
__macro(miopenConvolutionForwardGetWorkSpaceSize); \
__macro(miopenConvolutionBackwardDataGetWorkSpaceSize); \
__macro(miopenTransformTensor); \
__macro(miopenPoolingForward); \
__macro(miopenPoolingBackward); \
__macro(miopenSoftmaxBackward); \
__macro(miopenSoftmaxBackward_V2); \
__macro(miopenSoftmaxForward); \
__macro(miopenSoftmaxForward_V2); \
__macro(miopenCreateDropoutDescriptor); \
__macro(miopenDestroyDropoutDescriptor); \
__macro(miopenRestoreDropoutDescriptor); \
__macro(miopenDropoutGetStatesSize); \
__macro(miopenSetDropoutDescriptor); \
__macro(miopenCreateRNNDescriptor); \
__macro(miopenDestroyRNNDescriptor); \
__macro(miopenSetRNNDescriptor); \
__macro(miopenSetRNNDescriptor_V2); \
__macro(miopenGetRNNParamsSize); \
__macro(miopenGetRNNWorkspaceSize); \
__macro(miopenGetRNNTrainingReserveSize); \
__macro(miopenRNNForwardTraining); \
__macro(miopenRNNBackwardData); \
__macro(miopenRNNBackwardWeights); \
__macro(miopenRNNForwardInference); \
__macro(miopenGetTensorNumBytes);
MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \
__macro(miopenConvolutionBackwardData);
MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R3:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
__macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R4:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
__macro(miopenBatchNormalizationForwardTraining); \
__macro(miopenBatchNormalizationForwardInference); \
__macro(miopenBatchNormalizationBackward);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R5
#define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \
__macro(miopenCreateActivationDescriptor); \
__macro(miopenSetActivationDescriptor); \
__macro(miopenGetActivationDescriptor); \
__macro(miopenDestroyActivationDescriptor);
MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R6
#define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \
/*__macro(miopenSetRNNDescriptor_v6);*/
MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(miopenSetConvolutionGroupCount); \
__macro(miopenCreateCTCLossDescriptor); \
__macro(miopenDestroyCTCLossDescriptor); \
__macro(miopenGetCTCLossDescriptor); \
__macro(miopenSetCTCLossDescriptor); \
__macro(miopenGetCTCLossWorkspaceSize); \
__macro(miopenCTCLoss);
MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
/*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
__macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/mklml.h"
namespace pten {
namespace dynload {
std::once_flag mklml_dso_flag;
void* mklml_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MKLML_ROUTINE_EACH(DEFINE_WRAP);
#if !defined(_WIN32)
DEFINE_WRAP(mkl_scsrmm);
DEFINE_WRAP(mkl_dcsrmm);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkl.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag mklml_dso_flag;
extern void *mklml_dso_handle;
/**
 * The following macro definition generates a struct
 * (for each function) that dynamically loads the mklml routine
 * via operator overloading.
 */
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklmlFunc = decltype(&::__name); \
std::call_once(mklml_dso_flag, []() { \
mklml_dso_handle = pten::dynload::GetMKLMLDsoHandle(); \
}); \
      static void *p_##__name = dlsym(mklml_dso_handle, #__name);        \
      return reinterpret_cast<mklmlFunc>(p_##__name)(args...);           \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
#define MKLML_ROUTINE_EACH(__macro) \
__macro(cblas_sgemm); \
__macro(cblas_dgemm); \
__macro(cblas_cgemm); \
__macro(cblas_zgemm); \
__macro(cblas_saxpy); \
__macro(cblas_daxpy); \
__macro(cblas_caxpy); \
__macro(cblas_zaxpy); \
__macro(cblas_scopy); \
__macro(cblas_dcopy); \
__macro(cblas_ccopy); \
__macro(cblas_zcopy); \
__macro(cblas_sgemv); \
__macro(cblas_dgemv); \
__macro(cblas_cgemv); \
__macro(cblas_zgemv); \
__macro(cblas_strsm); \
__macro(cblas_dtrsm); \
__macro(cblas_ctrsm); \
__macro(cblas_ztrsm); \
__macro(cblas_sgemm_alloc); \
__macro(cblas_dgemm_alloc); \
__macro(cblas_sgemm_pack); \
__macro(cblas_dgemm_pack); \
__macro(cblas_sgemm_compute); \
__macro(cblas_dgemm_compute); \
__macro(cblas_sgemm_free); \
__macro(cblas_dgemm_free); \
__macro(cblas_sgemm_batch); \
__macro(cblas_dgemm_batch); \
__macro(cblas_cgemm_batch); \
__macro(cblas_zgemm_batch); \
__macro(cblas_sdot); \
__macro(cblas_ddot); \
__macro(cblas_sasum); \
__macro(cblas_dasum); \
__macro(cblas_isamax); \
__macro(cblas_idamax); \
__macro(cblas_sscal); \
__macro(cblas_dscal); \
__macro(vsAdd); \
__macro(vdAdd); \
__macro(vsSub); \
__macro(vdSub); \
__macro(vsMul); \
__macro(vdMul); \
__macro(vsDiv); \
__macro(vdDiv); \
__macro(vsExp); \
__macro(vdExp); \
__macro(vsSqr); \
__macro(vdSqr); \
__macro(vsPowx); \
__macro(vdPowx); \
__macro(vsInv); \
__macro(vdInv); \
__macro(vmsErf); \
__macro(vmdErf); \
__macro(MKL_Free_Buffers); \
__macro(MKL_Set_Num_Threads); \
__macro(MKL_Get_Max_Threads);
MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
#if !defined(_WIN32)
DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
DYNAMIC_LOAD_MKLML_WRAP(mkl_dcsrmm);
#endif
#undef DYNAMIC_LOAD_MKLML_WRAP
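// Usage sketch (illustrative only): the wrappers are drop-in replacements
// for the native MKL symbols, e.g. a row-major 2x2 SGEMM computing C = A * B:
//
//   float a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, c[4] = {0};
//   pten::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
//                              2, 2, 2, 1.0f, a, 2, b, 2, 0.0f, c, 2);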
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/mklrt.h"
namespace pten {
namespace dynload {
std::once_flag mklrt_dso_flag;
void* mklrt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MKLDFTI_ROUTINE_EACH(DEFINE_WRAP);
DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc,
enum DFTI_CONFIG_VALUE prec,
enum DFTI_CONFIG_VALUE domain,
MKL_LONG dim,
MKL_LONG* sizes) {
if (prec == DFTI_SINGLE) {
if (dim == 1) {
return DftiCreateDescriptor_s_1d(desc, domain, sizes[0]);
} else {
return DftiCreateDescriptor_s_md(desc, domain, dim, sizes);
}
} else if (prec == DFTI_DOUBLE) {
if (dim == 1) {
return DftiCreateDescriptor_d_1d(desc, domain, sizes[0]);
} else {
return DftiCreateDescriptor_d_md(desc, domain, dim, sizes);
}
} else {
return DftiCreateDescriptor(desc, prec, domain, dim, sizes);
}
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkl_dfti.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag mklrt_dso_flag;
extern void* mklrt_dso_handle;
/**
 * The following macro definition generates a struct
 * (for each function) that dynamically loads the mkldfti routine
 * via operator overloading.
 */
#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklrtFunc = decltype(&::__name); \
std::call_once(mklrt_dso_flag, []() { \
mklrt_dso_handle = pten::dynload::GetMKLRTDsoHandle(); \
}); \
static void* p_##__name = dlsym(mklrt_dso_handle, #__name); \
return reinterpret_cast<mklrtFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
// mkl_dfti.h has a macro that shadows the function with the same name.
// Un-define this macro so that the function can be exported.
#undef DftiCreateDescriptor
#define MKLDFTI_ROUTINE_EACH(__macro) \
__macro(DftiCreateDescriptor); \
__macro(DftiCreateDescriptor_s_1d); \
__macro(DftiCreateDescriptor_d_1d); \
__macro(DftiCreateDescriptor_s_md); \
__macro(DftiCreateDescriptor_d_md); \
__macro(DftiSetValue); \
__macro(DftiGetValue); \
__macro(DftiCommitDescriptor); \
__macro(DftiComputeForward); \
__macro(DftiComputeBackward); \
__macro(DftiFreeDescriptor); \
__macro(DftiErrorClass); \
__macro(DftiErrorMessage);
MKLDFTI_ROUTINE_EACH(DYNAMIC_LOAD_MKLRT_WRAP)
#undef DYNAMIC_LOAD_MKLRT_WRAP
// define another function to avoid naming conflict
DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc,
enum DFTI_CONFIG_VALUE prec,
enum DFTI_CONFIG_VALUE domain,
MKL_LONG dim,
MKL_LONG* sizes);
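// Usage sketch (illustrative only): create a single-precision 1-D real FFT
// descriptor of length 1024 through the precision-dispatching helper:
//
//   DFTI_DESCRIPTOR_HANDLE desc = nullptr;
//   MKL_LONG sizes[1] = {1024};
//   MKL_LONG status = pten::dynload::DftiCreateDescriptorX(
//       &desc, DFTI_SINGLE, DFTI_REAL, 1, sizes);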
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/nccl.h"
namespace pten {
namespace dynload {
std::once_flag nccl_dso_flag;
void *nccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
#if NCCL_VERSION_CODE >= 2212
NCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2304
NCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif
#if NCCL_VERSION_CODE >= 21100
NCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP)
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <nccl.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(nccl_dso_flag, []() { \
nccl_dso_handle = pten::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define NCCL_RAND_ROUTINE_EACH(__macro) \
__macro(ncclCommInitAll); \
__macro(ncclGetUniqueId); \
__macro(ncclCommInitRank); \
__macro(ncclCommDestroy); \
__macro(ncclCommCount); \
__macro(ncclCommCuDevice); \
__macro(ncclCommUserRank); \
__macro(ncclAllReduce); \
__macro(ncclBcast); \
__macro(ncclAllGather); \
__macro(ncclGroupStart); \
__macro(ncclGroupEnd); \
__macro(ncclReduce); \
__macro(ncclReduceScatter); \
__macro(ncclGetErrorString);
NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
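// Usage sketch (illustrative only): the wrappers mirror the native NCCL API,
// e.g. obtaining a unique id when bootstrapping a communicator:
//
//   ncclUniqueId id;
//   pten::dynload::ncclGetUniqueId(&id);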
#if NCCL_VERSION_CODE >= 2212
#define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2304
#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion);
NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 21100
#define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \
__macro(ncclRedOpCreatePreMulSum); \
__macro(ncclRedOpDestroy);
NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/nvjpeg.h"
namespace pten {
namespace dynload {
std::once_flag nvjpeg_dso_flag;
void *nvjpeg_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <nvjpeg.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag nvjpeg_dso_flag;
extern void *nvjpeg_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
nvjpegStatus_t operator()(Args... args) { \
using nvjpegFunc = decltype(&::__name); \
std::call_once(nvjpeg_dso_flag, []() { \
nvjpeg_dso_handle = pten::dynload::GetNvjpegDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \
return reinterpret_cast<nvjpegFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define NVJPEG_RAND_ROUTINE_EACH(__macro) \
__macro(nvjpegCreateSimple); \
__macro(nvjpegJpegStateCreate); \
__macro(nvjpegGetImageInfo); \
__macro(nvjpegJpegStateDestroy); \
__macro(nvjpegDecode);
NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
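// Usage sketch (illustrative only): create a default nvjpeg handle through
// the dynamically loaded entry point:
//
//   nvjpegHandle_t handle;
//   pten::dynload::nvjpegCreateSimple(&handle);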
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/nvrtc.h"
namespace pten {
namespace dynload {
std::once_flag nvrtc_dso_flag;
void* nvrtc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVRTC_ROUTINE_EACH(DEFINE_WRAP);
bool HasNVRTC() {
std::call_once(nvrtc_dso_flag,
[]() { nvrtc_dso_handle = GetNVRTCDsoHandle(); });
return nvrtc_dso_handle != nullptr;
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <nvrtc.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag nvrtc_dso_flag;
extern void* nvrtc_dso_handle;
extern bool HasNVRTC();
#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using nvrtc_func = decltype(&::__name); \
std::call_once(nvrtc_dso_flag, []() { \
nvrtc_dso_handle = pten::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(nvrtc_dso_handle, #__name); \
return reinterpret_cast<nvrtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/**
* include all needed nvrtc functions
**/
#define NVRTC_ROUTINE_EACH(__macro) \
__macro(nvrtcVersion); \
__macro(nvrtcGetErrorString); \
__macro(nvrtcCompileProgram); \
__macro(nvrtcCreateProgram); \
__macro(nvrtcDestroyProgram); \
__macro(nvrtcGetPTX); \
__macro(nvrtcGetPTXSize); \
__macro(nvrtcGetProgramLog); \
__macro(nvrtcGetProgramLogSize)
NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
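// Usage sketch (illustrative only): query the NVRTC version after checking
// that the library could be loaded:
//
//   int major = 0, minor = 0;
//   if (pten::dynload::HasNVRTC()) {
//     pten::dynload::nvrtcVersion(&major, &minor);
//   }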
} // namespace dynload
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include "paddle/pten/backends/dynload/nvtx.h"
namespace pten {
namespace dynload {
std::once_flag nvtx_dso_flag;
void *nvtx_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVTX_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef _WIN32
#include <cuda.h>
#include <nvToolsExt.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag nvtx_dso_flag;
extern void *nvtx_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
int operator()(Args... args) { \
using nvtxFunc = decltype(&::__name); \
std::call_once(nvtx_dso_flag, []() { \
nvtx_dso_handle = pten::dynload::GetNvtxDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \
return reinterpret_cast<nvtxFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \
__macro(nvtxRangePop);
NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
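// Usage sketch (illustrative only): bracket a profiled region with an NVTX
// range so it shows up in profiling tools:
//
//   pten::dynload::nvtxRangePushA("forward_pass");
//   // ... work to be profiled ...
//   pten::dynload::nvtxRangePop();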
} // namespace dynload
} // namespace pten
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/rccl.h"
namespace pten {
namespace dynload {
std::once_flag rccl_dso_flag;
void *rccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
#if NCCL_VERSION_CODE >= 2212
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <rccl.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag rccl_dso_flag;
extern void* rccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(&::__name); \
std::call_once(rccl_dso_flag, []() { \
rccl_dso_handle = pten::dynload::GetNCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(rccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define RCCL_RAND_ROUTINE_EACH(__macro) \
__macro(ncclCommInitAll); \
__macro(ncclGetUniqueId); \
__macro(ncclCommInitRank); \
__macro(ncclCommDestroy); \
__macro(ncclCommCount); \
__macro(ncclCommCuDevice); \
__macro(ncclCommUserRank); \
__macro(ncclAllReduce); \
__macro(ncclBcast); \
__macro(ncclAllGather); \
__macro(ncclGroupStart); \
__macro(ncclGroupEnd); \
__macro(ncclReduce); \
__macro(ncclReduceScatter); \
__macro(ncclGetErrorString);
RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212
#define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/rocblas.h"
namespace pten {
namespace dynload {
std::once_flag rocblas_dso_flag;
void *rocblas_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
ROCBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R2
ROCBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R3
ROCBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif
#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R4
ROCBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hip/hip_runtime.h>
#include <rocblas.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag rocblas_dso_flag;
extern void *rocblas_dso_handle;
/**
 * The following macro definition generates a struct
 * (for each function) that dynamically loads the rocblas routine
 * via operator overloading.
 *
 * note: default dynamic linked libs
 */
#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
rocblas_status operator()(Args... args) { \
using rocblas_func = decltype(&::__name); \
std::call_once(rocblas_dso_flag, []() { \
rocblas_dso_handle = pten::dynload::GetCublasDsoHandle(); \
}); \
static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \
return reinterpret_cast<rocblas_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(rocblas_caxpy); \
__macro(rocblas_saxpy); \
__macro(rocblas_daxpy); \
__macro(rocblas_zaxpy); \
__macro(rocblas_sscal); \
__macro(rocblas_dscal); \
__macro(rocblas_scopy); \
__macro(rocblas_dcopy); \
__macro(rocblas_cgemv); \
__macro(rocblas_sgemv); \
__macro(rocblas_zgemv); \
__macro(rocblas_dgemv); \
__macro(rocblas_cgemm); \
__macro(rocblas_sgemm); \
__macro(rocblas_dgemm); \
__macro(rocblas_hgemm); \
__macro(rocblas_zgemm); \
__macro(rocblas_sgeam); \
__macro(rocblas_strsm); \
__macro(rocblas_dtrsm); \
__macro(rocblas_dgeam); \
__macro(rocblas_sgemm_batched); \
__macro(rocblas_dgemm_batched); \
__macro(rocblas_cgemm_batched); \
__macro(rocblas_zgemm_batched); \
__macro(rocblas_create_handle); \
__macro(rocblas_destroy_handle); \
__macro(rocblas_set_stream); \
__macro(rocblas_get_stream); \
__macro(rocblas_set_pointer_mode); \
__macro(rocblas_get_pointer_mode);
ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// APIs mirroring the cuBLAS routines added after CUDA 8.0
#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
__macro(rocblas_gemm_ex); \
__macro(rocblas_sgemm_strided_batched); \
__macro(rocblas_dgemm_strided_batched); \
__macro(rocblas_cgemm_strided_batched); \
__macro(rocblas_zgemm_strided_batched); \
__macro(rocblas_hgemm_strided_batched);
ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// Not supported by HIP in ROCm 3.5:
// #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
// __macro(cublasSetMathMode);
// __macro(cublasGetMathMode);
// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(rocblas_gemm_batched_ex); \
__macro(rocblas_gemm_strided_batched_ex);
ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
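// Usage sketch (illustrative only): create and destroy a rocblas handle
// through the wrappers:
//
//   rocblas_handle handle;
//   pten::dynload::rocblas_create_handle(&handle);
//   pten::dynload::rocblas_destroy_handle(handle);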
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/rocm_driver.h"
namespace pten {
namespace dynload {
std::once_flag rocm_dso_flag;
void* rocm_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
ROCM_ROUTINE_EACH(DEFINE_WRAP);
bool HasCUDADriver() {
std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); });
return rocm_dso_handle != nullptr;
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hip/hip_runtime.h>
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
namespace pten {
namespace dynload {
extern std::once_flag rocm_dso_flag;
extern void* rocm_dso_handle;
extern bool HasCUDADriver();
#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using rocm_func = decltype(&::__name); \
std::call_once(rocm_dso_flag, []() { \
rocm_dso_handle = pten::dynload::GetCUDADsoHandle(); \
}); \
static void* p_##__name = dlsym(rocm_dso_handle, #__name); \
return reinterpret_cast<rocm_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/**
 * include all needed ROCm (HIP) driver functions
 **/
#define ROCM_ROUTINE_EACH(__macro) \
__macro(hipDriverGetVersion); \
__macro(hipGetErrorString); \
__macro(hipModuleLoadData); \
__macro(hipModuleGetFunction); \
__macro(hipModuleUnload); \
  /* ROCm 3.5 does not support this function */                   \
/* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \
__macro(hipModuleLaunchKernel); \
__macro(hipLaunchKernel); \
__macro(hipGetDevice); \
__macro(hipGetDeviceCount); \
__macro(hipDevicePrimaryCtxGetState)
ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP
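// Usage sketch (illustrative only): guard driver calls behind the
// availability check, e.g. counting visible devices:
//
//   int count = 0;
//   if (pten::dynload::HasCUDADriver()) {
//     pten::dynload::hipGetDeviceCount(&count);
//   }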
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/tensorrt.h"
#include <string>
namespace pten {
namespace dynload {
std::once_flag tensorrt_dso_flag;
void* tensorrt_dso_handle;
std::once_flag tensorrt_plugin_dso_flag;
void* tensorrt_plugin_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
TENSORRT_RAND_ROUTINE_EACH_POINTER(DEFINE_WRAP);
TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(DEFINE_WRAP);
TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DEFINE_WRAP);
void* GetDsoHandle(const std::string& dso_name) {
#if !defined(_WIN32)
int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
#else
int dynload_flags = 0;
#endif // !_WIN32
void* dso_handle = dlopen(dso_name.c_str(), dynload_flags);
if (nullptr == dso_handle) {
auto error_msg =
"You are using Paddle compiled with TensorRT, but TensorRT dynamic "
"library is not found. Ignore this if TensorRT is not needed.\n"
"The TensorRT that Paddle depends on is not configured correctly.\n"
" Suggestions:\n"
" 1. Check if the TensorRT is installed correctly and its version"
" is matched with paddlepaddle you installed.\n"
" 2. Configure environment variables as "
"follows:\n"
" - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
" - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
" - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...`\n";
LOG(WARNING) << error_msg;
}
return dso_handle;
}
void* GetTensorRtHandle() {
#if defined(__APPLE__) || defined(__OSX__)
std::string dso_name = "libnvinfer.dylib";
#elif defined(_WIN32)
std::string dso_name = "nvinfer.dll";
#else
std::string dso_name = "libnvinfer.so";
#endif
return GetDsoHandle(dso_name);
}
void* GetTensorRtPluginHandle() {
#if defined(__APPLE__) || defined(__OSX__)
std::string dso_name = "libnvinfer_plugin.dylib";
#elif defined(_WIN32)
std::string dso_name = "nvinfer_plugin.dll";
#else
std::string dso_name = "libnvinfer_plugin.so";
#endif
return GetDsoHandle(dso_name);
}
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <NvInfer.h>
#include <NvInferPlugin.h>
#if !defined(_WIN32)
#include <dlfcn.h>
#endif
#include <mutex> // NOLINT
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/backends/dynload/dynamic_loader.h"
namespace pten {
namespace dynload {
void* GetTensorRtHandle();
extern std::once_flag tensorrt_dso_flag;
extern void* tensorrt_dso_handle;
void* GetTensorRtPluginHandle();
extern std::once_flag tensorrt_plugin_dso_flag;
extern void* tensorrt_plugin_dso_handle;
#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
void* operator()(Args... args) { \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = pten::dynload::GetTensorRtHandle(); \
}); \
static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
if (p_##__name == nullptr) { \
return nullptr; \
} \
using tensorrt_func = decltype(&::__name); \
auto ret = reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
return static_cast<void*>(ret); \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = pten::dynload::GetTensorRtHandle(); \
}); \
static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
PADDLE_ENFORCE_NOT_NULL(p_##__name, \
paddle::platform::errors::Unavailable( \
"Load tensorrt api %s failed", #__name)); \
using tensorrt_func = decltype(&::__name); \
return reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
std::call_once(tensorrt_plugin_dso_flag, []() { \
tensorrt_plugin_dso_handle = pten::dynload::GetTensorRtPluginHandle(); \
}); \
static void* p_##__name = dlsym(tensorrt_plugin_dso_handle, #__name); \
PADDLE_ENFORCE_NOT_NULL(p_##__name, \
paddle::platform::errors::Unavailable( \
"Load tensorrt plugin %s failed", #__name)); \
using tensorrt_plugin_func = decltype(&::__name); \
return reinterpret_cast<tensorrt_plugin_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#ifdef NV_TENSORRT_MAJOR
#if (NV_TENSORRT_MAJOR >= 6)
#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL); \
__macro(getPluginRegistry);
#else
#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL);
#endif
#define TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(__macro) \
__macro(getInferLibVersion);
#define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \
__macro(initLibNvInferPlugins);
TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP)
TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(
DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP)
TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP)
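// Usage sketch (illustrative only): query the version of the TensorRT
// library found at runtime:
//
//   int version = pten::dynload::getInferLibVersion();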
#endif // end of NV_TENSORRT_MAJOR
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/dynload/warpctc.h"
namespace pten {
namespace dynload {
std::once_flag warpctc_dso_flag;
void* warpctc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
WARPCTC_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mutex> // NOLINT
#include "paddle/pten/backends/dynload/dynamic_loader.h"
#include "paddle/pten/backends/dynload/port.h"
#include "warpctc/include/ctc.h"
namespace pten {
namespace dynload {
extern std::once_flag warpctc_dso_flag;
extern void* warpctc_dso_handle;
/**
 * The following macro definition generates a struct
 * (for each function) that dynamically loads the warpctc routine
 * via operator overloading.
 */
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using warpctcFunc = decltype(&::__name); \
std::call_once(warpctc_dso_flag, []() { \
warpctc_dso_handle = pten::dynload::GetWarpCTCDsoHandle(); \
}); \
      static void* p_##__name = dlsym(warpctc_dso_handle, #__name);      \
      return reinterpret_cast<warpctcFunc>(p_##__name)(args...);         \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
DYNAMIC_LOAD_WARPCTC_WRAP(__name)
#define WARPCTC_ROUTINE_EACH(__macro) \
__macro(get_warpctc_version); \
__macro(ctcGetStatusString); \
__macro(compute_ctc_loss); \
__macro(compute_ctc_loss_double); \
__macro(get_workspace_size); \
__macro(get_workspace_size_double)
WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP);
#undef DYNAMIC_LOAD_WARPCTC_WRAP
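// Usage sketch (illustrative only): query the warp-ctc version through the
// dynamically loaded entry point:
//
//   int version = pten::dynload::get_warpctc_version();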
} // namespace dynload
} // namespace pten