Unverified commit 7648f429, authored by zhouweiwei2014, committed by GitHub

Sparse attention kernel is used from CUDA 11.8 (#47594)

Parent 5fc92943
@@ -64,7 +64,7 @@ CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 #endif
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
 #define CUSPARSE_ROUTINE_EACH_R3(__macro) \
   __macro(cusparseDnMatSetStridedBatch);  \
   __macro(cusparseCooSetStridedBatch);    \
...
@@ -76,7 +76,7 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 #endif
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
 #define CUSPARSE_ROUTINE_EACH_R3(__macro) \
   __macro(cusparseDnMatSetStridedBatch);  \
   __macro(cusparseCooSetStridedBatch);    \
...
@@ -101,7 +101,7 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x,
         gpu_type);
   });
   if (batch_size > 1) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
     dev_ctx.CusparseCall([&](cusparseHandle_t handle) {
       phi::dynload::cusparseCsrSetStridedBatch(
           *descriptor, batch_size, M + 1, batch_nnz);
@@ -109,7 +109,7 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x,
 #else
     PADDLE_THROW(phi::errors::Unimplemented(
         "Batch Sparse matmul use 'cusparseCsrSetStridedBatch', which is "
-        "supported from CUDA 11.7"));
+        "supported from CUDA 11.8"));
 #endif
   }
 }
@@ -155,7 +155,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x,
   });
   if (batch_size > 1) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
     dev_ctx.CusparseCall([&](cusparseHandle_t handle) {
       phi::dynload::cusparseCooSetStridedBatch(
           *descriptor, batch_size, batch_nnz);
@@ -163,7 +163,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x,
 #else
     PADDLE_THROW(phi::errors::Unimplemented(
         "Batch Sparse matmul use 'cusparseCooSetStridedBatch', which is "
-        "supported from CUDA 11.7"));
+        "supported from CUDA 11.8"));
 #endif
   }
 }
@@ -241,7 +241,7 @@ class CuSparseDnMatDescriptor {
     PADDLE_ENFORCE_EQ(x.numel(), batch_size * M * N);
     if (batch_size > 1) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
       dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
         phi::dynload::cusparseDnMatSetStridedBatch(
             descriptor_, batch_size, M * N);
@@ -249,7 +249,7 @@ class CuSparseDnMatDescriptor {
 #else
       PADDLE_THROW(phi::errors::Unimplemented(
           "Batch Sparse matmul use 'cusparseDnMatSetStridedBatch', which is "
-          "supported from CUDA 11.7"));
+          "supported from CUDA 11.8"));
 #endif
     }
     VLOG(6) << "Create cusparseDnMatDescr_t " << &descriptor_;
...
@@ -65,7 +65,7 @@ void FusedAttentionCsrGradKernel(const Context& dev_ctx,
                                  DenseTensor* dquery,
                                  DenseTensor* dkey,
                                  DenseTensor* dvalue) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
   /* Step1: Forward: softmax{CSR} * value{Dense} -> out{Dense}, reuse */
   SparseCsrTensor dsoftmax;
   MatmulCsrDenseGradKernel<T, Context>(
@@ -129,7 +129,7 @@ void FusedAttentionCsrGradKernel(const Context& dev_ctx,
   PADDLE_THROW(
       phi::errors::Unimplemented("backward of 'sparse.nn.functional.attention' "
                                  "use 'cusparseCsrSetStridedBatch', which is "
-                                 "completed supported from CUDA 11.7"));
+                                 "completed supported from CUDA 11.8"));
 #endif
 }
...
@@ -99,7 +99,7 @@ void FusedAttentionCsrKernel(
     const paddle::optional<DenseTensor>& attn_mask,
     DenseTensor* out,
     SparseCsrTensor* softmax) {
-#if CUDA_VERSION >= 11070
+#if CUDA_VERSION >= 11080
   /* Check Shape */
   auto q_dim = query.dims();
   auto q_rank = q_dim.size();
@@ -217,7 +217,7 @@ void FusedAttentionCsrKernel(
   PADDLE_THROW(
       phi::errors::Unimplemented("forward of 'sparse.nn.functional.attention' "
                                  "use 'cusparseCsrSetStridedBatch', which is "
-                                 "completed supported from CUDA 11.7"));
+                                 "completed supported from CUDA 11.8"));
 #endif
 }
...
@@ -91,8 +91,8 @@ class TestAddmm(unittest.TestCase):
         self.check_result([16, 10], [16, 12], [12, 10], 'csr')

     @unittest.skipIf(
-        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070,
-        "only support cuda>=11.7",
+        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080,
+        "only support cuda>=11.8",
     )
     def test_addmm_3d(self):
         self.check_result([8, 16, 10], [8, 16, 12], [8, 12, 10], 'coo')
...
@@ -36,8 +36,8 @@ def get_cuda_version():

 @unittest.skipIf(
-    not core.is_compiled_with_cuda() or get_cuda_version() < 11070,
-    "core is not compiled with CUDA and cuda version need larger than or equal to 11.7",
+    not core.is_compiled_with_cuda() or get_cuda_version() < 11080,
+    "core is not compiled with CUDA and cuda version need larger than or equal to 11.8",
 )
 class TestSparseAttentionAPI1(unittest.TestCase):
     def setUp(self):
...
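The thresholds 11070 and 11080 in these skipIf conditions come from the get_cuda_version() helper referenced in the hunk header, which encodes the toolkit version as major * 1000 + minor * 10 (so CUDA 11.8 becomes 11080). The helper body is not part of this diff; a minimal sketch of such a check, assuming nvcc is available on PATH, could look like:

import os
import re


def get_cuda_version():
    # Sketch only: parse "release 11.8" out of `nvcc --version`.
    # The actual helper in Paddle's test utilities may differ.
    output = os.popen("nvcc --version").read()
    match = re.search(r"release (\d+)\.(\d+)", output)
    if not match:
        return -1
    major, minor = int(match.group(1)), int(match.group(2))
    return major * 1000 + minor * 10  # e.g. CUDA 11.8 -> 11080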
@@ -83,8 +83,8 @@ class TestMatmul(unittest.TestCase):
         self.check_result([16, 12], [12, 10], 'csr')

     @unittest.skipIf(
-        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070,
-        "only support cuda>=11.7",
+        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080,
+        "only support cuda>=11.8",
     )
     def test_matmul_3d(self):
         self.check_result([8, 16, 12], [8, 12, 10], 'coo')
@@ -131,8 +131,8 @@ class TestMaskedMatmul(unittest.TestCase):
         np.testing.assert_allclose(np_y_grad, y.grad.numpy(), rtol=1e-05)

     @unittest.skipIf(
-        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070,
-        "only support on cuda>=11.7",
+        not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080,
+        "only support on cuda>=11.8",
     )
     def test_masked_matmul_3d(self):
         paddle.set_default_dtype('float32')
...
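At the Python level, the strided-batch cuSPARSE calls guarded in the kernel changes above are what back these 3-D (batched) test cases. A minimal usage sketch, assuming a CUDA 11.8+ build and that the sparse API is exposed as paddle.sparse (the module path is an assumption, not shown in this diff), could look like:

import paddle

# Batched sparse @ dense matmul over shapes [batch, M, K] x [batch, K, N].
# Sketch only; the batched path relies on cusparse*SetStridedBatch (CUDA >= 11.8).
paddle.set_default_dtype('float32')
x = paddle.rand([8, 16, 12])
y = paddle.rand([8, 12, 10])

sp_x = x.to_sparse_coo(3)            # COO sparse view of the batched input
out = paddle.sparse.matmul(sp_x, y)  # dense output of shape [8, 16, 10]
print(out.shape)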
@@ -30,7 +30,7 @@ def attention(
 ):
     r"""
     Note:
-        This API is only used from ``CUDA 11.7`` .
+        This API is only used from ``CUDA 11.8`` .
         SparseCsrTensor is used to store the intermediate result of Attention matrix
         in Transformer module, which can reduce memory usage and improve performance.
...
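For completeness, here is a hedged usage sketch of the attention API whose docstring is touched above. The module path paddle.sparse.nn.functional.attention and any parameter names beyond those visible in this diff (query, key, value, attn_mask) are assumptions, including the name sparse_mask for the CSR mask:

import paddle

# Sketch only: 4-D query/key/value of shape [batch, num_heads, seq_len, head_dim],
# with a CSR mask of assumed shape [batch * num_heads, seq_len, seq_len]; needs CUDA >= 11.8.
batch, num_heads, seq_len, head_dim = 1, 2, 8, 16
q = paddle.rand([batch, num_heads, seq_len, head_dim])
k = paddle.rand([batch, num_heads, seq_len, head_dim])
v = paddle.rand([batch, num_heads, seq_len, head_dim])

mask = paddle.ones([batch * num_heads, seq_len, seq_len])
sparse_mask = mask.to_sparse_csr()  # which attention entries to keep

out = paddle.sparse.nn.functional.attention(q, k, v, sparse_mask)
print(out.shape)  # expected: [1, 2, 8, 16]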