diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 36655508be2ea9e748333171073c7dc258de52f2..3abbcdb71d03eaf6f8eba3d97150d27ac5a5405e 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -45,6 +45,9 @@ void gemm( const half* h_B = reinterpret_cast(B); half* h_C = reinterpret_cast(C); + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, + "cublas Hgemm requires GPU compute capability >= 53"); PADDLE_ENFORCE(platform::dynload::cublasHgemm( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, N)); @@ -106,6 +109,9 @@ void gemm( const half* h_B = reinterpret_cast(B); half* h_C = reinterpret_cast(C); + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, + "cublas Hgemm requires GPU compute capability >= 53"); PADDLE_ENFORCE(platform::dynload::cublasHgemm( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, ldc)); @@ -251,6 +257,9 @@ void batched_gemm( const half* h_B = reinterpret_cast(B); half* h_C = reinterpret_cast(C); + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, + "cublas Hgemm requires GPU compute capability >= 53"); PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 442e62d563ebd40316d001914c93447c102cbf61..8982d9d066165a9da0461288685baa0c60e5f114 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -72,6 +72,11 @@ TEST(math_function, notrans_mul_trans_fp16) { CUDAPlace gpu_place(0); CUDADeviceContext context(gpu_place); + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); @@ -149,6 +154,11 @@ TEST(math_function, trans_mul_notrans_fp16) { CUDAPlace gpu_place(0); CUDADeviceContext context(gpu_place); + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); @@ -248,6 +258,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) { CUDAPlace gpu_place(0); CUDADeviceContext context(gpu_place); + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + int m = 2; int n = 3; int k = 3; @@ -355,6 +370,11 @@ TEST(math_function, gemm_trans_cublas_fp16) { CUDAPlace gpu_place(0); CUDADeviceContext context(gpu_place); + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + int m = 2; int n = 3; int k = 3; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index bb9fbd468f38fffc94107e321e777fc0e772fbe6..98b4178177b0a8bafd6fe34a92be2a07a2fbc5a7 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -127,6 +127,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { SetDeviceId(place_.device); + compute_capability = GetCUDAComputeCapability(place_.device); multi_process = GetCUDAMultiProcessors(place_.device); max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); @@ -162,6 +163,10 @@ void CUDADeviceContext::Wait() const { PADDLE_ENFORCE(cudaGetLastError()); } +int CUDADeviceContext::GetComputeCapability() const { + return compute_capability; +} + int CUDADeviceContext::GetMaxPhysicalThreadCount() const { return multi_process * max_threads_per_mp; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index e779644190de1246cd650fbf91eeaeb03494643f..603b890af13b529c490c29112a73a09cc815d07a 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return place in the device context. */ Place GetPlace() const override; + /*! \brief Return compute capability in the device context. */ + int GetComputeCapability() const; + /*! \brief Return the max physical thread count in the device context */ int GetMaxPhysicalThreadCount() const; @@ -104,6 +107,7 @@ class CUDADeviceContext : public DeviceContext { cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; + int compute_capability; int multi_process; int max_threads_per_mp; }; diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index da4041bad0d82fe1c8c7a12fd0c7177e6dbddef3..dd70ff9ff574b32bc96a9e8255b1bf77a5cc84e4 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -33,6 +33,15 @@ int GetCUDADeviceCount() { return count; } +int GetCUDAComputeCapability(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + cudaDeviceProp device_prop; + PADDLE_ENFORCE(cudaGetDeviceProperties(&device_prop, id), + "cudaGetDeviceProperties failed in " + "paddle::platform::GetCUDAComputeCapability"); + return device_prop.major * 10 + device_prop.minor; +} + int GetCUDAMultiProcessors(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index c38ccf0f2ade1d2405177b541b33fd84283726ff..fa469fa77f5ca780da153cc87da8d04f239711f3 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -30,6 +30,9 @@ const std::string kEnvFractionGpuMemoryToUse = //! Get the total number of GPU devices in system. int GetCUDADeviceCount(); +//! Get the compute capability of the ith GPU (format: major * 10 + minor) +int GetCUDAComputeCapability(int i); + //! Get the MultiProcessors of the ith GPU. int GetCUDAMultiProcessors(int i);