diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index c487313f91c5882220a049f26eb916535b6856cf..b910b4ec73901b9430e31f3e4b06b25c04c7be91 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -94,7 +94,7 @@ if (WITH_GPU OR WITH_ROCM)
     endif()
     op_library(sync_batch_norm_op)
     file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
-    if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) )
+    if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) )
       op_library(sparse_attention_op)
       file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sparse_attention);\n")
     endif()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 0c2731bc45258dd8b7dd456db67aa23e34f66d0f..9d6a1d00cff604764726cc7018b6c490943d1922 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -464,6 +464,11 @@ list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_while)
 # disable this unittest temporarily
 list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
 
+# disable the sparse_attention unit test when the environment is not suitable
+if ( (NOT WITH_GPU) OR (WIN32) OR (PADDLE_WITH_ARM) OR (WITH_ROCM) )
+  list(REMOVE_ITEM TEST_OPS test_sparse_attention_op)
+endif()
+
 if (APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_dataset)
   list(REMOVE_ITEM TEST_OPS test_dataset_dataloader)
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
index 48401fb55ef3f568b4db4edd019f4b7375c032a6..5134b885f330727d1c7f89e473f1fe085a7c028e 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
@@ -16,10 +16,13 @@ import unittest
 import numpy as np
 from op_test import OpTest
 import paddle.fluid.core as core
+from paddle.static import Program, program_guard
 import paddle
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+import paddle.nn.functional as F
 import os
 import re
-import platform
 
 
 def get_cuda_version():
@@ -34,22 +37,6 @@ def get_cuda_version():
         return -1
 
 
-def get_linux_platform():
-    if platform.system().lower() == 'windows':
-        return 0
-    elif platform.system().lower() == 'linux':
-        return 1
-    else:
-        return -1
-
-
-def get_suitable_env():
-    if get_cuda_version() >= 11020 and get_linux_platform() == 1:
-        return True
-    else:
-        return False
-
-
 def softmax(x):
     max = np.max(x, axis=1, keepdims=True)
     e_x = np.exp(x - max)
@@ -141,8 +128,9 @@ def init_csr_format(batch_size, num_heads, rows, blocksize):
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda() or get_suitable_env() == False,
-    "core is not compiled with CUDA and cuda version need >= 11.2 in windows")
+    not core.is_compiled_with_cuda() or get_cuda_version() < 11020,
+    "core is not compiled with CUDA or the CUDA version is lower than 11.2"
+)
 class TestSparseAttentionOp(OpTest):
     def config(self):
         self.shape = (1, 1, 16, 8)
@@ -201,5 +189,130 @@ class TestSparseAttentionOpShapeTest(TestSparseAttentionOp):
         self.dtype = "float64"
 
 
+@unittest.skipIf(
+    not core.is_compiled_with_cuda() or get_cuda_version() < 11020,
+    "core is not compiled with CUDA or the CUDA version is lower than 11.2"
+)
+class TestSparseAttentionAPI(unittest.TestCase):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0)
+        self.shape = (1, 1, 8, 4)
+        self.blocksize = 2
+        self.dtype = 'float64'
+
+    def test_static_graph(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program()):
+            Q = paddle.static.data(name="Q", shape=self.shape, dtype=self.dtype)
+            K = paddle.static.data(name="K", shape=self.shape, dtype=self.dtype)
+            V = paddle.static.data(name="V", shape=self.shape, dtype=self.dtype)
+
+            batch_size, num_heads, rows = self.shape[0], self.shape[
+                1], self.shape[2]
+            block_num = rows // self.blocksize
+            block_last = rows % self.blocksize
+            sparse_nnz_num = block_num * self.blocksize * self.blocksize + block_last * block_last
+            offset_shape = (batch_size, num_heads, rows + 1)
+            columns_shape = (batch_size, num_heads, int(sparse_nnz_num))
+
+            offset = paddle.static.data(
+                name="Offset", shape=offset_shape, dtype="int32")
+            columns = paddle.static.data(
+                name="Columns", shape=columns_shape, dtype="int32")
+            Out = F.sparse_attention(Q, K, V, offset, columns)
+
+            Q_np = np.random.random(self.shape).astype(self.dtype)
+            K_np = np.random.random(self.shape).astype(self.dtype)
+            V_np = np.random.random(self.shape).astype(self.dtype)
+            offset_np, columns_np = init_csr_format(
+                self.shape[0], self.shape[1], self.shape[2], self.blocksize)
+            offset_np = offset_np.astype('int32')
+            columns_np = columns_np.astype('int32')
+
+            exe = fluid.Executor(self.place)
+            fetches_result = exe.run(feed={
+                "Q": Q_np,
+                "K": K_np,
+                "V": V_np,
+                "Offset": offset_np,
+                "Columns": columns_np
+            },
+                                     fetch_list=[Out])
+            expected_result, __, __ = ref_batch_sparse_attention(
+                Q_np, K_np, V_np, offset_np, columns_np)
+
+            self.assertTrue(
+                np.allclose(
+                    fetches_result, expected_result, atol=1e-5))
+
+    def test_dygraph(self):
+        paddle.disable_static()
+        offset, columns = init_csr_format(self.shape[0], self.shape[1],
+                                          self.shape[2], self.blocksize)
+        offset = offset.astype('int32')
+        columns = columns.astype('int32')
+        query = np.random.random(self.shape).astype(self.dtype)
+        key = np.random.random(self.shape).astype(self.dtype)
+        value = np.random.random(self.shape).astype(self.dtype)
+
+        paddle_query = paddle.to_tensor(query, place=self.place)
+        paddle_key = paddle.to_tensor(key, place=self.place)
+        paddle_value = paddle.to_tensor(value, place=self.place)
+        paddle_offset = paddle.to_tensor(offset, place=self.place)
+        paddle_columns = paddle.to_tensor(columns, place=self.place)
+
+        paddle_result = F.sparse_attention(paddle_query, paddle_key,
+                                           paddle_value, paddle_offset,
+                                           paddle_columns)
+
+        numpy_result, __, __ = ref_batch_sparse_attention(query, key, value,
+                                                          offset, columns)
+        numpy_result = numpy_result.astype(self.dtype)
+
+        self.assertTrue(
+            np.allclose(
+                paddle_result.numpy(), numpy_result, atol=1e-5))
+
+
+class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0)
+        self.shape = (2, 2, 8, 4)
+        self.blocksize = 2
+        self.dtype = 'float32'
+
+
+class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0)
+        self.shape = (2, 2, 64, 32)
+        self.blocksize = 2
+        self.dtype = 'float64'
+
+
+class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0)
+        self.shape = (2, 1, 64, 32)
+        self.blocksize = 2
+        self.dtype = 'float64'
+
+
+class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0)
+        self.shape = (4, 4, 128, 32)
+        self.blocksize = 8
+        self.dtype = 'float64'
+
+
+class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0)
+        self.shape = (3, 3, 35, 15)
+        self.blocksize = 3
+        self.dtype = 'float64'
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index 7965b362b9c55aa7d09711b2639fef789dfbf454..4151f25b94aff2ee005c6d759cf11e07db397f8e 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -112,6 +112,8 @@ from .input import embedding  # noqa: F401
 from ...fluid.layers import gather_tree  # noqa: F401
 from ...fluid.layers import temporal_shift  # noqa: F401
 
+from .sparse_attention import sparse_attention
+
 __all__ = [  #noqa
     'conv1d',
     'conv1d_transpose',
@@ -207,4 +209,5 @@ __all__ = [  #noqa
     'layer_norm',
     'instance_norm',
     'class_center_sample',
+    'sparse_attention',
 ]
diff --git a/python/paddle/nn/functional/sparse_attention.py b/python/paddle/nn/functional/sparse_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..f57669f11457f6f1246952fee733523e2b6f4230
--- /dev/null
+++ b/python/paddle/nn/functional/sparse_attention.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+import paddle
+from ...fluid.framework import in_dygraph_mode, default_main_program
+from paddle.fluid.layer_helper import LayerHelper
+from ...fluid.framework import in_dygraph_mode
+from paddle import _C_ops
+
+
+def sparse_attention(query,
+                     key,
+                     value,
+                     sparse_csr_offset,
+                     sparse_csr_columns,
+                     name=None):
+    r"""
+    This operator sparsifies the attention matrix in the Transformer module
+    to reduce memory consumption and computation.
+    The sparse layout is expressed in CSR format and contains two parameters,
+    ``offset`` and ``columns``.
+
+    .. math::
+
+        result = softmax(\frac{ Q * K^T }{\sqrt{d}}) * V
+
+    where ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
+    The dimensions of the three parameters are the same.
+    ``d`` represents the size of the last dimension of the three parameters.
+
+    Parameters:
+        query(Tensor): The query tensor in the Attention module.
+                        It's a 4-D tensor with a shape of
+                        :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`.
+                        The dtype can be ``float32`` or ``float64``.
+        key(Tensor): The key tensor in the Attention module.
+                        It's a 4-D tensor with a shape of
+                        :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`.
+                        The dtype can be ``float32`` or ``float64``.
+        value(Tensor): The value tensor in the Attention module.
+                        It's a 4-D tensor with a shape of
+                        :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`.
+                        The dtype can be ``float32`` or ``float64``.
+        sparse_csr_offset(Tensor): The sparsity feature in the Attention module
+                        is expressed in the CSR format; the offset is the CSR
+                        row pointer, whose entry ``i`` stores the cumulative number
+                        of non-zero elements in the first ``i`` rows of the matrix.
+                        It's a 3-D tensor with a shape of
+                        :math:`[batch\_size, num\_heads, seq\_len + 1]`.
+                        The dtype should be ``int32``.
+        sparse_csr_columns(Tensor): The sparsity feature in the Attention module
+                        is expressed in the CSR format; the columns are the
+                        column indices of the non-zero elements in the matrix.
+                        It's a 3-D tensor with a shape of
+                        :math:`[batch\_size, num\_heads, sparse\_nnz]`.
+                        The dtype should be ``int32``.
+        name(str, optional): The default value is None. Normally there is no need for the user
+                        to set this property. For more information, please refer to
+                        :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor holding the result of the Attention module.
+        It's a 4-D tensor with a shape of
+        :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`.
+        The dtype can be ``float32`` or ``float64``.
+
+    Examples:
+        .. code-block:: python
+
+            # required: skiptest
+            import paddle
+            import numpy as np
+
+            query_data = np.array([[[[0, 1], [2, 3],
+                                     [0, 1], [2, 3]]]]).astype("float32")
+            key_data = np.array([[[[0, 1], [2, 3],
+                                   [0, 1], [2, 3]]]]).astype("float32")
+            value_data = np.array([[[[0, 1], [2, 3],
+                                     [0, 1], [2, 3]]]]).astype("float32")
+            sparse_csr_offset_data = np.array([[[0, 2,
+                                                 4, 6, 8]]]).astype("int32")
+            sparse_csr_columns_data = np.array([[[0, 1,
+                                                  0, 1, 2, 3, 2, 3]]]).astype("int32")
+            print(query_data.shape)
+            # (1, 1, 4, 2)
+            print(sparse_csr_offset_data.shape)
+            # (1, 1, 5)
+            print(sparse_csr_columns_data.shape)
+            # (1, 1, 8)
+            paddle.disable_static()
+            query = paddle.to_tensor(query_data, stop_gradient=False,
+                                     place=paddle.CUDAPlace(0))
+            key = paddle.to_tensor(key_data, stop_gradient=False,
+                                   place=paddle.CUDAPlace(0))
+            value = paddle.to_tensor(value_data, stop_gradient=False,
+                                     place=paddle.CUDAPlace(0))
+            offset = paddle.to_tensor(sparse_csr_offset_data, stop_gradient=False,
+                                      place=paddle.CUDAPlace(0))
+            columns = paddle.to_tensor(sparse_csr_columns_data, stop_gradient=False,
+                                       place=paddle.CUDAPlace(0))
+            output = paddle.nn.functional.sparse_attention(query, key,
+                                                           value, offset, columns)
+            print(output)
+
+            # [[[[1.60885942, 2.60885954],
+            #    [1.99830270, 2.99830270],
+            #    [1.60885942, 2.60885954],
+            #    [1.99830270, 2.99830270]]]]
+    """
+    if in_dygraph_mode():
+        result_attention, result_sdd, result_softmax = _C_ops.sparse_attention(
+            query, key, value, sparse_csr_offset, sparse_csr_columns)
+        return result_attention
+
+    helper = LayerHelper('sparse_attention', **locals())
+    dtype = helper.input_dtype(input_param_name='Q')
+    out = helper.create_variable_for_type_inference(dtype)
+    result_sdd = helper.create_variable_for_type_inference(dtype)
+    result_softmax = helper.create_variable_for_type_inference(dtype)
+    inputs = {
+        'Q': query,
+        'K': key,
+        'V': value,
+        'Offset': sparse_csr_offset,
+        'Columns': sparse_csr_columns
+    }
+    outputs = {
+        'Out': out,
+        'SparseDotSdd': result_sdd,
+        'Softmax': result_softmax
+    }
+    helper.append_op(type='sparse_attention', inputs=inputs, outputs=outputs)
+    return out
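
For readers of this patch, the sketch below (not part of the diff) illustrates how the CSR layout consumed by paddle.nn.functional.sparse_attention can be built by hand for a block-diagonal mask. The helper name block_diag_csr is hypothetical; it only mirrors the block-diagonal pattern implied by the nnz formula used in the unit test above, and running it assumes a GPU build with CUDA >= 11.2, as the tests require.

    # Illustrative sketch, not the PR's own helper: build a block-diagonal CSR
    # mask (offset = row pointer, columns = column indices) and call the new API.
    import numpy as np
    import paddle
    import paddle.nn.functional as F


    def block_diag_csr(batch_size, num_heads, rows, blocksize):
        # Each row attends only to columns inside its own block; the last,
        # possibly smaller, block attends within itself.
        offset, columns = [0], []
        for r in range(rows):
            start = (r // blocksize) * blocksize
            cols = range(start, min(start + blocksize, rows))
            columns.extend(cols)
            offset.append(offset[-1] + len(cols))
        # Reuse the same layout for every (batch, head) pair.
        offset = np.tile(np.array(offset, dtype='int32'), (batch_size, num_heads, 1))
        columns = np.tile(np.array(columns, dtype='int32'), (batch_size, num_heads, 1))
        return offset, columns


    bs, heads, seq_len, head_dim, blocksize = 1, 1, 4, 2, 2
    offset_np, columns_np = block_diag_csr(bs, heads, seq_len, blocksize)
    # For these sizes: offset = [0, 2, 4, 6, 8], columns = [0, 1, 0, 1, 2, 3, 2, 3],
    # matching the docstring example.
    q = paddle.rand((bs, heads, seq_len, head_dim), dtype='float32')
    k = paddle.rand((bs, heads, seq_len, head_dim), dtype='float32')
    v = paddle.rand((bs, heads, seq_len, head_dim), dtype='float32')
    out = F.sparse_attention(q, k, v,
                             paddle.to_tensor(offset_np),
                             paddle.to_tensor(columns_np))
    print(out.shape)  # [1, 1, 4, 2]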