Refine CUDA Related libraries

a0466053 · Yu Yang · 67bbcbbc · a0466053 · a0466053 · a0466053
10 changed files
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
 add_subdirectory(dynload)
-nv_test(cuda_test SRCS cuda_test.cu)
+nv_test(cuda_test SRCS cuda_test.cu DEPS dyload_cuda)
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
+IF(WITH_GPU)
+    set(GPU_CTX_DEPS dyload_cuda dynamic_loader )
+ELSE()
+    set(GPU_CTX_DEPS)
+ENDIF()
-nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 glog gflags)
+cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS})
+nv_test(device_context_test SRCS device_context_test.cc DEPS device_context glog gflags)
--- a/paddle/platform/cuda.h
+++ b/paddle/platform/cuda.h
@@ -28,19 +28,19 @@ inline void throw_on_error(cudaError_t e, const char* message) {
  }
 }
-int GetDeviceCount(void) {
+inline int GetDeviceCount(void) {
  int count;
  throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed");
  return count;
 }
-int GetCurrentDeviceId(void) {
+inline int GetCurrentDeviceId(void) {
  int device_id;
  throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed");
  return device_id;
 }
-void SetDeviceId(int device_id) {
+inline void SetDeviceId(int device_id) {
  throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed");
 }

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
+#include <paddle/platform/device_context.h>
+namespace paddle {
+namespace platform {
+namespace dynload {
+namespace dummy {
+// Make DeviceContext A library.
+int DUMMY_VAR_FOR_DEV_CTX = 0;
+}  // namespace dummy
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
\ No newline at end of file
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
+nv_library(dyload_cuda SRCS cublas.cc cudnn.cc curand.cc)
--- a/paddle/platform/dynload/cublas.cc
+++ b/paddle/platform/dynload/cublas.cc
+#include <paddle/platform/dynload/cublas.h>
+namespace paddle {
+namespace platform {
+namespace dynload {
+// Storage for the once-flag and DSO handle declared `extern` in cublas.h.
+std::once_flag cublas_dso_flag;
+void *cublas_dso_handle = nullptr;
+// Defines one wrapper object per routine. The macro deliberately has no
+// trailing ';': CUBLAS_BLAS_ROUTINE_EACH separates its entries with ';' and
+// the call site below terminates the last one, so adding another here would
+// produce stray empty declarations (and would differ from cudnn.cc/curand.cc).
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
\ No newline at end of file
--- a/paddle/platform/dynload/cublas.h
+++ b/paddle/platform/dynload/cublas.h
@@ -23,8 +23,8 @@ namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag cublas_dso_flag;
+extern std::once_flag cublas_dso_flag;
-void *cublas_dso_handle = nullptr;
+extern void *cublas_dso_handle;
 /**
 * The following macro definition can generate structs
@@ -34,10 +34,10 @@ void *cublas_dso_handle = nullptr;
 * note: default dynamic linked libs
 */
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)                            \
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                    \
  struct DynLoad__##__name {                                        \
    template <typename... Args>                                     \
-    cublasStatus_t operator()(Args... args) {                       \
+    inline cublasStatus_t operator()(Args... args) {                \
      typedef cublasStatus_t (*cublasFunc)(Args...);                \
      std::call_once(cublas_dso_flag,                               \
                     paddle::platform::dynload::GetCublasDsoHandle, \
@@ -45,62 +45,43 @@ void *cublas_dso_handle = nullptr;
      void *p_##__name = dlsym(cublas_dso_handle, #__name);         \
      return reinterpret_cast<cublasFunc>(p_##__name)(args...);     \
    }                                                               \
-  } __name;  // struct DynLoad__##__name
+  };                                                                \
+  extern DynLoad__##__name __name
 #else
+// Non-DSO variant: the wrapper's operator() calls `__name(args...)` directly
+// instead of resolving the symbol through dlsym. `template <...>` has to be
+// the first token of the member declaration; a leading `inline` in front of
+// it is ill-formed, and a member function defined inside the class body is
+// implicitly inline, so no specifier is needed.
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)      \
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
-  struct DynLoad__##__name {                  \
+  struct DynLoad__##__name {                     \
-    template <typename... Args>               \
+    template <typename... Args>                  \
-    cublasStatus_t operator()(Args... args) { \
+    cublasStatus_t operator()(Args... args) {    \
-      return __name(args...);                 \
+      return __name(args...);                    \
-    }                                         \
+    }                                            \
-  } __name;  // struct DynLoad__##__name
+  };                                             \
+  extern DynLoad__##__name __name
 #endif
-#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
-// include all needed cublas functions in HPPL
-// clang-format off
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
-  __macro(cublasSgemv)                    \
+  __macro(cublasSgemv);                   \
-  __macro(cublasDgemv)                    \
+  __macro(cublasDgemv);                   \
-  __macro(cublasSgemm)                    \
+  __macro(cublasSgemm);                   \
-  __macro(cublasDgemm)                    \
+  __macro(cublasDgemm);                   \
-  __macro(cublasSgeam)                    \
+  __macro(cublasSgeam);                   \
-  __macro(cublasDgeam)                    \
+  __macro(cublasDgeam);                   \
+  __macro(cublasCreate);                  \
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate)
+  __macro(cublasDestroy);                 \
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy)
+  __macro(cublasSetStream);               \
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream)
+  __macro(cublasSetPointerMode);          \
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode)
+  __macro(cublasGetPointerMode);          \
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode)
+  __macro(cublasSgemmBatched);            \
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched)
+  __macro(cublasDgemmBatched);            \
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
+  __macro(cublasCgemmBatched);            \
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
+  __macro(cublasZgemmBatched);            \
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
+  __macro(cublasSgetrfBatched);           \
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
+  __macro(cublasSgetriBatched);           \
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
+  __macro(cublasDgetrfBatched);           \
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched)
+  __macro(cublasDgetriBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched)
-CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
-#undef DYNAMIC_LOAD_CUBLAS_WRAP
+CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP);
-#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP
-#undef CUBLAS_BLAS_ROUTINE_EACH
-// clang-format on
+#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
-#ifndef PADDLE_TYPE_DOUBLE
-#define CUBLAS_GEAM paddle::platform::dynload::cublasSgeam
-#define CUBLAS_GEMV paddle::platform::dynload::cublasSgemv
-#define CUBLAS_GEMM paddle::platform::dynload::cublasSgemm
-#define CUBLAS_GETRF paddle::platform::dynload::cublasSgetrfBatched
-#define CUBLAS_GETRI paddle::platform::dynload::cublasSgetriBatched
-#else
-#define CUBLAS_GEAM paddle::platform::dynload::cublasDgeam
-#define CUBLAS_GEMV paddle::platform::dynload::cublasDgemv
-#define CUBLAS_GEMM paddle::platform::dynload::cublasDgemm
-#define CUBLAS_GETRF paddle::platform::dynload::cublasDgetrfBatched
-#define CUBLAS_GETRI paddle::platform::dynload::cublasDgetriBatched
-#endif
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/platform/dynload/cudnn.cc
+#include <paddle/platform/dynload/cudnn.h>
+namespace paddle {
+namespace platform {
+namespace dynload {
+std::once_flag cudnn_dso_flag;
+void* cudnn_dso_handle = nullptr;
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP);
+CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
+#endif
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
+CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
+#endif
+#ifdef CUDNN_DNN_ROUTINE_EACH_R5
+CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
+#endif
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
\ No newline at end of file
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
@@ -23,12 +23,12 @@ namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag cudnn_dso_flag;
+extern std::once_flag cudnn_dso_flag;
-void* cudnn_dso_handle = nullptr;
+extern void* cudnn_dso_handle;
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                    \
  struct DynLoad__##__name {                                       \
    template <typename... Args>                                    \
    auto operator()(Args... args) -> decltype(__name(args...)) {   \
@@ -39,17 +39,19 @@ void* cudnn_dso_handle = nullptr;
      void* p_##__name = dlsym(cudnn_dso_handle, #__name);         \
      return reinterpret_cast<cudnn_func>(p_##__name)(args...);    \
    }                                                              \
-  } __name; /* struct DynLoad__##__name */
+  };                                                               \
+  extern struct DynLoad__##__name __name
 #else
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                          \
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                  \
  struct DynLoad__##__name {                                     \
    template <typename... Args>                                  \
    auto operator()(Args... args) -> decltype(__name(args...)) { \
      return __name(args...);                                    \
    }                                                            \
-  } __name; /* struct DynLoad__##__name */
+  };                                                             \
+  extern DynLoad__##__name __name
 #endif
@@ -57,80 +59,73 @@ void* cudnn_dso_handle = nullptr;
 * include all needed cudnn functions in HPPL
 * different cudnn version has different interfaces
 **/
-// clang-format off
+#define CUDNN_DNN_ROUTINE_EACH(__macro)             \
-#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
+  __macro(cudnnSetTensor4dDescriptor);              \
-  __macro(cudnnSetTensor4dDescriptor)                     \
+  __macro(cudnnSetTensor4dDescriptorEx);            \
-  __macro(cudnnSetTensor4dDescriptorEx)                   \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);   \
-  __macro(cudnnGetConvolutionNdForwardOutputDim)          \
+  __macro(cudnnGetConvolutionForwardAlgorithm);     \
-  __macro(cudnnGetConvolutionForwardAlgorithm)            \
+  __macro(cudnnCreateTensorDescriptor);             \
-  __macro(cudnnCreateTensorDescriptor)                    \
+  __macro(cudnnDestroyTensorDescriptor);            \
-  __macro(cudnnDestroyTensorDescriptor)                   \
+  __macro(cudnnCreateFilterDescriptor);             \
-  __macro(cudnnCreateFilterDescriptor)                    \
+  __macro(cudnnSetFilter4dDescriptor);              \
-  __macro(cudnnSetFilter4dDescriptor)                     \
+  __macro(cudnnSetPooling2dDescriptor);             \
-  __macro(cudnnSetPooling2dDescriptor)                    \
+  __macro(cudnnDestroyFilterDescriptor);            \
-  __macro(cudnnDestroyFilterDescriptor)                   \
+  __macro(cudnnCreateConvolutionDescriptor);        \
-  __macro(cudnnCreateConvolutionDescriptor)               \
+  __macro(cudnnCreatePoolingDescriptor);            \
-  __macro(cudnnCreatePoolingDescriptor)                   \
+  __macro(cudnnDestroyPoolingDescriptor);           \
-  __macro(cudnnDestroyPoolingDescriptor)                  \
+  __macro(cudnnSetConvolution2dDescriptor);         \
-  __macro(cudnnSetConvolution2dDescriptor)                \
+  __macro(cudnnDestroyConvolutionDescriptor);       \
-  __macro(cudnnDestroyConvolutionDescriptor)              \
+  __macro(cudnnCreate);                             \
-  __macro(cudnnCreate)                                    \
+  __macro(cudnnDestroy);                            \
-  __macro(cudnnDestroy)                                   \
+  __macro(cudnnSetStream);                          \
-  __macro(cudnnSetStream)                                 \
+  __macro(cudnnActivationForward);                  \
-  __macro(cudnnActivationForward)                         \
+  __macro(cudnnConvolutionForward);                 \
-  __macro(cudnnConvolutionForward)                        \
+  __macro(cudnnConvolutionBackwardBias);            \
-  __macro(cudnnConvolutionBackwardBias)                   \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize); \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize)        \
+  __macro(cudnnTransformTensor);                    \
-  __macro(cudnnTransformTensor)                           \
+  __macro(cudnnPoolingForward);                     \
-  __macro(cudnnPoolingForward)                            \
+  __macro(cudnnPoolingBackward);                    \
-  __macro(cudnnPoolingBackward)                           \
+  __macro(cudnnSoftmaxBackward);                    \
-  __macro(cudnnSoftmaxBackward)                           \
+  __macro(cudnnSoftmaxForward);                     \
-  __macro(cudnnSoftmaxForward)                            \
+  __macro(cudnnGetVersion);                         \
-  __macro(cudnnGetVersion)                                \
+  __macro(cudnnGetErrorString);
-  __macro(cudnnGetErrorString)
+CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP)
+#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
-#define CUDNN_DNN_ROUTINE_EACH_R2(__macro)                \
+  __macro(cudnnAddTensor);                 \
-  __macro(cudnnAddTensor)                                 \
+  __macro(cudnnConvolutionBackwardData);   \
-  __macro(cudnnConvolutionBackwardData)                   \
+  __macro(cudnnConvolutionBackwardFilter);
-  __macro(cudnnConvolutionBackwardFilter)
+CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP)
 // APIs available after R3:
 #if CUDNN_VERSION >= 3000
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)              \
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)           \
-  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
+  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
-  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm);       \
-  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm);     \
-  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
+  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize);
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
 #endif
 // APIs available after R4:
 #if CUDNN_VERSION >= 4007
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)             \
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)    \
-  __macro(cudnnBatchNormalizationForwardTraining)            \
+  __macro(cudnnBatchNormalizationForwardTraining);  \
-  __macro(cudnnBatchNormalizationForwardInference)           \
+  __macro(cudnnBatchNormalizationForwardInference); \
-  __macro(cudnnBatchNormalizationBackward)
+  __macro(cudnnBatchNormalizationBackward);
-CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
 #endif
 // APIs in R5
 #if CUDNN_VERSION >= 5000
-#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)                    \
+#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)  \
-  __macro(cudnnCreateActivationDescriptor)                    \
+  __macro(cudnnCreateActivationDescriptor); \
-  __macro(cudnnSetActivationDescriptor)                       \
+  __macro(cudnnSetActivationDescriptor);    \
-  __macro(cudnnGetActivationDescriptor)                       \
+  __macro(cudnnGetActivationDescriptor);    \
-  __macro(cudnnDestroyActivationDescriptor)
+  __macro(cudnnDestroyActivationDescriptor);
-CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R5
 #endif
-#undef CUDNN_DNN_ROUTINE_EACH
-// clang-format on
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/dynload/curand.cc
+++ b/paddle/platform/dynload/curand.cc
+#include <paddle/platform/dynload/curand.h>
+namespace paddle {
+namespace platform {
+namespace dynload {
+// Storage for the once-flag and DSO handle declared `extern` in curand.h.
+std::once_flag curand_dso_flag;
+// Explicitly initialised to nullptr, as the removed header definition was and
+// as the cublas/cudnn handles are in their own .cc files.
+void *curand_dso_handle = nullptr;
+// Defines one wrapper object per routine listed in CURAND_RAND_ROUTINE_EACH.
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
\ No newline at end of file
--- a/paddle/platform/dynload/curand.h
+++ b/paddle/platform/dynload/curand.h
@@ -22,10 +22,10 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag curand_dso_flag;
+extern std::once_flag curand_dso_flag;
-void *curand_dso_handle = nullptr;
+extern void *curand_dso_handle;
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)                            \
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                    \
  struct DynLoad__##__name {                                        \
    template <typename... Args>                                     \
    curandStatus_t operator()(Args... args) {                       \
@@ -36,32 +36,29 @@ void *curand_dso_handle = nullptr;
      void *p_##__name = dlsym(curand_dso_handle, #__name);         \
      return reinterpret_cast<curandFunc>(p_##__name)(args...);     \
    }                                                               \
-  } __name; /* struct DynLoad__##__name */
+  };                                                                \
+  extern DynLoad__##__name __name
 #else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)      \
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
-  struct DynLoad__##__name {                  \
+  struct DynLoad__##__name {                     \
-    template <typename... Args>               \
+    template <typename... Args>                  \
-    curandStatus_t operator()(Args... args) { \
+    curandStatus_t operator()(Args... args) {    \
-      return __name(args...);                 \
+      return __name(args...);                    \
-    }                                         \
+    }                                            \
-  } __name; /* struct DynLoad__##__name */
+  };                                             \
+  extern DynLoad__##__name __name
 #endif
-/* include all needed curand functions in HPPL */
+#define CURAND_RAND_ROUTINE_EACH(__macro)      \
-// clang-format off
+  __macro(curandCreateGenerator);              \
-#define CURAND_RAND_ROUTINE_EACH(__macro)    \
+  __macro(curandSetStream);                    \
-  __macro(curandCreateGenerator)             \
+  __macro(curandSetPseudoRandomGeneratorSeed); \
-  __macro(curandSetStream)                   \
+  __macro(curandGenerateUniform);              \
-  __macro(curandSetPseudoRandomGeneratorSeed)\
+  __macro(curandGenerateUniformDouble);        \
-  __macro(curandGenerateUniform)             \
+  __macro(curandDestroyGenerator);
-  __macro(curandGenerateUniformDouble)       \
-  __macro(curandDestroyGenerator)
-// clang-format on
-CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
+CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
-#undef CURAND_RAND_ROUTINE_EACH
-#undef DYNAMIC_LOAD_CURAND_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle