Sparse attention kernel is supported from CUDA 11.8 (#47594)

7648f429 · zhouweiwei2014 · GitHub · 5fc92943 · 7648f429 · 7648f429
9 changed files
--- a/paddle/fluid/platform/dynload/cusparse.h
+++ b/paddle/fluid/platform/dynload/cusparse.h
@@ -64,7 +64,7 @@ CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 #endif

-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
 #define CUSPARSE_ROUTINE_EACH_R3(__macro) \
  __macro(cusparseDnMatSetStridedBatch);  \
  __macro(cusparseCooSetStridedBatch);    \

--- a/paddle/phi/backends/dynload/cusparse.h
+++ b/paddle/phi/backends/dynload/cusparse.h
@@ -76,7 +76,7 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 #endif

-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
 #define CUSPARSE_ROUTINE_EACH_R3(__macro) \
  __macro(cusparseDnMatSetStridedBatch);  \
  __macro(cusparseCooSetStridedBatch);    \

--- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h
+++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h
@@ -101,7 +101,7 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x,
                                    gpu_type);
  });
  if (batch_size > 1) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
    dev_ctx.CusparseCall([&](cusparseHandle_t handle) {
      phi::dynload::cusparseCsrSetStridedBatch(
          *descriptor, batch_size, M + 1, batch_nnz);
@@ -109,7 +109,7 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x,
 #else
    PADDLE_THROW(phi::errors::Unimplemented(
        "Batch Sparse matmul use 'cusparseCsrSetStridedBatch', which is "
-        "supported from CUDA 11.7"));
+        "supported from CUDA 11.8"));
 #endif
  }
 }
@@ -155,7 +155,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x,
  });

  if (batch_size > 1) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
    dev_ctx.CusparseCall([&](cusparseHandle_t handle) {
      phi::dynload::cusparseCooSetStridedBatch(
          *descriptor, batch_size, batch_nnz);
@@ -163,7 +163,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x,
 #else
    PADDLE_THROW(phi::errors::Unimplemented(
        "Batch Sparse matmul use 'cusparseCooSetStridedBatch', which is "
-        "supported from CUDA 11.7"));
+        "supported from CUDA 11.8"));
 #endif
  }
 }
@@ -241,7 +241,7 @@ class CuSparseDnMatDescriptor {

    PADDLE_ENFORCE_EQ(x.numel(), batch_size * M * N);
    if (batch_size > 1) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
      dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
        phi::dynload::cusparseDnMatSetStridedBatch(
            descriptor_, batch_size, M * N);
@@ -249,7 +249,7 @@ class CuSparseDnMatDescriptor {
 #else
      PADDLE_THROW(phi::errors::Unimplemented(
          "Batch Sparse matmul use 'cusparseDnMatSetStridedBatch', which is "
-          "supported from CUDA 11.7"));
+          "supported from CUDA 11.8"));
 #endif
    }
    VLOG(6) << "Create cusparseDnMatDescr_t " << &descriptor_;

--- a/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu
@@ -65,7 +65,7 @@ void FusedAttentionCsrGradKernel(const Context& dev_ctx,
                                 DenseTensor* dquery,
                                 DenseTensor* dkey,
                                 DenseTensor* dvalue) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
  /* Step1: Forward: softmax{CSR} * value{Dense} -> out{Dense}, reuse */
  SparseCsrTensor dsoftmax;
  MatmulCsrDenseGradKernel<T, Context>(
@@ -129,7 +129,7 @@ void FusedAttentionCsrGradKernel(const Context& dev_ctx,
  PADDLE_THROW(
      phi::errors::Unimplemented("backward of 'sparse.nn.functional.attention' "
                                 "use 'cusparseCsrSetStridedBatch', which is "
-                                 "completed supported from CUDA 11.7"));
+                                 "completed supported from CUDA 11.8"));
 #endif
 }


--- a/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu
@@ -99,7 +99,7 @@ void FusedAttentionCsrKernel(
    const paddle::optional<DenseTensor>& attn_mask,
    DenseTensor* out,
    SparseCsrTensor* softmax) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
  /* Check Shape */
  auto q_dim = query.dims();
  auto q_rank = q_dim.size();
@@ -217,7 +217,7 @@ void FusedAttentionCsrKernel(
  PADDLE_THROW(
      phi::errors::Unimplemented("forward of 'sparse.nn.functional.attention' "
                                 "use 'cusparseCsrSetStridedBatch', which is "
-                                 "completed supported from CUDA 11.7"));
+                                 "completed supported from CUDA 11.8"));
 #endif
 }


--- a/python/paddle/fluid/tests/unittests/test_sparse_addmm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_addmm_op.py
@@ -91,8 +91,8 @@ class TestAddmm(unittest.TestCase):
        self.check_result([16, 10], [16, 12], [12, 10], 'csr')

    @unittest.skipIf(
-        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070,
-        "only support cuda>=11.7",
+        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080,
+        "only support cuda>=11.8",
    )
    def test_addmm_3d(self):
        self.check_result([8, 16, 10], [8, 16, 12], [8, 12, 10], 'coo')

--- a/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py
@@ -36,8 +36,8 @@ def get_cuda_version():


 @unittest.skipIf(
-    not core.is_compiled_with_cuda() or get_cuda_version() < 11070,
-    "core is not compiled with CUDA and cuda version need larger than or equal to 11.7",
+    not core.is_compiled_with_cuda() or get_cuda_version() < 11080,
+    "core is not compiled with CUDA and cuda version need larger than or equal to 11.8",
 )
 class TestSparseAttentionAPI1(unittest.TestCase):
    def setUp(self):

--- a/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py
@@ -83,8 +83,8 @@ class TestMatmul(unittest.TestCase):
        self.check_result([16, 12], [12, 10], 'csr')

    @unittest.skipIf(
-        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070,
-        "only support cuda>=11.7",
+        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080,
+        "only support cuda>=11.8",
    )
    def test_matmul_3d(self):
        self.check_result([8, 16, 12], [8, 12, 10], 'coo')
@@ -131,8 +131,8 @@ class TestMaskedMatmul(unittest.TestCase):
        np.testing.assert_allclose(np_y_grad, y.grad.numpy(), rtol=1e-05)

    @unittest.skipIf(
-        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070,
-        "only support on cuda>=11.7",
+        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080,
+        "only support on cuda>=11.8",
    )
    def test_masked_matmul_3d(self):
        paddle.set_default_dtype('float32')

--- a/python/paddle/sparse/nn/functional/transformer.py
+++ b/python/paddle/sparse/nn/functional/transformer.py
@@ -30,7 +30,7 @@ def attention(
 ):
    r"""
    Note:
-        This API is only used from ``CUDA 11.7`` .
+        This API is only used from ``CUDA 11.8`` .

    SparseCsrTensor is used to store the intermediate result of Attention matrix
    in Transformer module, which can reduce memory usage and improve performance.