Unverified commit 7648f429, authored by zhouweiwei2014, committed by GitHub

sparse attention kernel is used from 11.8 (#47594)

Parent 5fc92943
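This commit raises the minimum CUDA version for PaddlePaddle's batched sparse kernels from 11.7 (CUDA_VERSION 11070) to 11.8 (11080); per the commit message and the updated error strings, the cusparse*SetStridedBatch routines these kernels depend on are only usable from CUDA 11.8. Every hunk below updates the same compile-time guard pattern, sketched here in minimal form (BatchedSparseOp is an illustrative stand-in, not an actual Paddle identifier):

#include <cuda.h>
#include <stdexcept>

// Minimal sketch of the guard this commit updates in each file.
void BatchedSparseOp(int batch_size) {
  if (batch_size > 1) {
#if CUDA_VERSION >= 11080  // bumped from 11070 by this commit
    // The strided-batch cuSPARSE descriptor calls are available here.
#else
    // Older toolkits lack the strided-batch entry points, so the
    // batched path is reported as unimplemented instead.
    throw std::runtime_error("batched sparse matmul requires CUDA 11.8+");
#endif
  }
}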
@@ -64,7 +64,7 @@ CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 #endif
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
 #define CUSPARSE_ROUTINE_EACH_R3(__macro) \
   __macro(cusparseDnMatSetStridedBatch); \
   __macro(cusparseCooSetStridedBatch); \
......
@@ -76,7 +76,7 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 #endif
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
 #define CUSPARSE_ROUTINE_EACH_R3(__macro) \
   __macro(cusparseDnMatSetStridedBatch); \
   __macro(cusparseCooSetStridedBatch); \
......
@@ -101,7 +101,7 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x,
         gpu_type);
   });
   if (batch_size > 1) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
     dev_ctx.CusparseCall([&](cusparseHandle_t handle) {
       phi::dynload::cusparseCsrSetStridedBatch(
           *descriptor, batch_size, M + 1, batch_nnz);
@@ -109,7 +109,7 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x,
 #else
     PADDLE_THROW(phi::errors::Unimplemented(
         "Batch Sparse matmul use 'cusparseCsrSetStridedBatch', which is "
-        "supported from CUDA 11.7"));
+        "supported from CUDA 11.8"));
 #endif
   }
 }
@@ -155,7 +155,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x,
   });
   if (batch_size > 1) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
     dev_ctx.CusparseCall([&](cusparseHandle_t handle) {
       phi::dynload::cusparseCooSetStridedBatch(
           *descriptor, batch_size, batch_nnz);
@@ -163,7 +163,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x,
 #else
     PADDLE_THROW(phi::errors::Unimplemented(
         "Batch Sparse matmul use 'cusparseCooSetStridedBatch', which is "
-        "supported from CUDA 11.7"));
+        "supported from CUDA 11.8"));
 #endif
   }
 }
@@ -241,7 +241,7 @@ class CuSparseDnMatDescriptor {
     PADDLE_ENFORCE_EQ(x.numel(), batch_size * M * N);
     if (batch_size > 1) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
       dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
         phi::dynload::cusparseDnMatSetStridedBatch(
             descriptor_, batch_size, M * N);
@@ -249,7 +249,7 @@ class CuSparseDnMatDescriptor {
 #else
       PADDLE_THROW(phi::errors::Unimplemented(
           "Batch Sparse matmul use 'cusparseDnMatSetStridedBatch', which is "
-          "supported from CUDA 11.7"));
+          "supported from CUDA 11.8"));
 #endif
     }
     VLOG(6) << "Create cusparseDnMatDescr_t " << &descriptor_;
......
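All three guarded calls above configure batching the same way: the matrices of a batch are laid out contiguously, and cuSPARSE is told the batch count plus the stride between consecutive batches. A hedged sketch against the raw cuSPARSE API, outside Paddle's dynload wrappers; it assumes csr_desc and dn_desc were already created with cusparseCreateCsr and cusparseCreateDnMat over buffers holding all batches:

#include <cusparse.h>

// Hedged sketch: promote single-matrix descriptors to strided batches.
void SetStridedBatches(cusparseSpMatDescr_t csr_desc,
                       cusparseDnMatDescr_t dn_desc,
                       int batch_size, int64_t M, int64_t N,
                       int64_t batch_nnz) {
  // CSR: each batch owns M + 1 row offsets and batch_nnz columns/values,
  // matching the (M + 1, batch_nnz) strides in CreateCsrDescriptor above.
  cusparseCsrSetStridedBatch(csr_desc, batch_size,
                             /*offsetsBatchStride=*/M + 1,
                             /*columnsValuesBatchStride=*/batch_nnz);
  // Dense: consecutive batches sit M * N elements apart, as in
  // CuSparseDnMatDescriptor above.
  cusparseDnMatSetStridedBatch(dn_desc, batch_size, /*batchStride=*/M * N);
}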
@@ -65,7 +65,7 @@ void FusedAttentionCsrGradKernel(const Context& dev_ctx,
                                  DenseTensor* dquery,
                                  DenseTensor* dkey,
                                  DenseTensor* dvalue) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
   /* Step1: Forward: softmax{CSR} * value{Dense} -> out{Dense}, reuse */
   SparseCsrTensor dsoftmax;
   MatmulCsrDenseGradKernel<T, Context>(
@@ -129,7 +129,7 @@ void FusedAttentionCsrGradKernel(const Context& dev_ctx,
   PADDLE_THROW(
       phi::errors::Unimplemented("backward of 'sparse.nn.functional.attention' "
                                  "use 'cusparseCsrSetStridedBatch', which is "
-                                 "completed supported from CUDA 11.7"));
+                                 "completed supported from CUDA 11.8"));
 #endif
 }
......
@@ -99,7 +99,7 @@ void FusedAttentionCsrKernel(
     const paddle::optional<DenseTensor>& attn_mask,
     DenseTensor* out,
     SparseCsrTensor* softmax) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
   /* Check Shape */
   auto q_dim = query.dims();
   auto q_rank = q_dim.size();
@@ -217,7 +217,7 @@ void FusedAttentionCsrKernel(
   PADDLE_THROW(
       phi::errors::Unimplemented("forward of 'sparse.nn.functional.attention' "
                                  "use 'cusparseCsrSetStridedBatch', which is "
-                                 "completed supported from CUDA 11.7"));
+                                 "completed supported from CUDA 11.8"));
 #endif
 }
......
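For orientation, the two fused kernels guarded above implement scaled dot-product attention with the softmax matrix kept in CSR form. In the notation of the Step1 comment in the gradient kernel (softmax{CSR} * value{Dense} -> out{Dense}), the forward computation is, schematically (the exact masking and scaling live in the kernel body elided from this diff):

out = softmax( Q K^T / sqrt(d) + mask ) V,  with softmax(...) stored as a SparseCsrTensor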
@@ -91,8 +91,8 @@ class TestAddmm(unittest.TestCase):
         self.check_result([16, 10], [16, 12], [12, 10], 'csr')

     @unittest.skipIf(
-        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070,
-        "only support cuda>=11.7",
+        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080,
+        "only support cuda>=11.8",
     )
     def test_addmm_3d(self):
         self.check_result([8, 16, 10], [8, 16, 12], [8, 12, 10], 'coo')
......
@@ -36,8 +36,8 @@ def get_cuda_version():

 @unittest.skipIf(
-    not core.is_compiled_with_cuda() or get_cuda_version() < 11070,
-    "core is not compiled with CUDA and cuda version need larger than or equal to 11.7",
+    not core.is_compiled_with_cuda() or get_cuda_version() < 11080,
+    "core is not compiled with CUDA and cuda version need larger than or equal to 11.8",
 )
 class TestSparseAttentionAPI1(unittest.TestCase):
     def setUp(self):
......
@@ -83,8 +83,8 @@ class TestMatmul(unittest.TestCase):
         self.check_result([16, 12], [12, 10], 'csr')

     @unittest.skipIf(
-        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070,
-        "only support cuda>=11.7",
+        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080,
+        "only support cuda>=11.8",
     )
     def test_matmul_3d(self):
         self.check_result([8, 16, 12], [8, 12, 10], 'coo')
@@ -131,8 +131,8 @@ class TestMaskedMatmul(unittest.TestCase):
         np.testing.assert_allclose(np_y_grad, y.grad.numpy(), rtol=1e-05)

     @unittest.skipIf(
-        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070,
-        "only support on cuda>=11.7",
+        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080,
+        "only support on cuda>=11.8",
     )
     def test_masked_matmul_3d(self):
         paddle.set_default_dtype('float32')
......
@@ -30,7 +30,7 @@ def attention(
 ):
     r"""
     Note:
-        This API is only used from ``CUDA 11.7`` .
+        This API is only used from ``CUDA 11.8`` .

     SparseCsrTensor is used to store the intermediate result of Attention matrix
     in Transformer module, which can reduce memory usage and improve performance.
......