fix gemm compute_type (#47613)

954be40d · sneaxiy · GitHub · b8ae3858 · 954be40d
Hide whitespace changes
Inline | Side-by-side

Showing 1 changed file with 9 additions and 1 deletion

paddle/phi/kernels/funcs/blas/blas_impl.cu.h +9 -1

No files found.
--- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
+++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
@@ -1452,7 +1452,11 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
            << FLAGS_gemm_use_half_precision_compute_type;

    auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
-    cudaDataType_t compute_type = CUDA_R_32F;
+#if CUDA_VERSION >= 11000
+    auto compute_type = CUBLAS_COMPUTE_32F;
+#else
+    auto compute_type = CUDA_R_32F;
+#endif

    float h_alpha = static_cast<float>(alpha);
    float h_beta = static_cast<float>(beta);
@@ -1463,7 +1467,11 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
        std::is_same<T, phi::dtype::float16>::value) {
      a = static_cast<void *>(&alpha);
      b = static_cast<void *>(&beta);
+#if CUDA_VERSION >= 11000
+      compute_type = CUBLAS_COMPUTE_16F;
+#else
      compute_type = CUDA_R_16F;
+#endif
    }

    context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {