diff --git a/docs/en/models/Twins.md b/docs/en/models/Twins.md new file mode 100644 index 0000000000000000000000000000000000000000..ccd83e44a47c99ed3c95481c30a682068cb17ff6 --- /dev/null +++ b/docs/en/models/Twins.md @@ -0,0 +1,17 @@ +# Twins + +## Overview +The Twins network includes Twins-PCPVT and Twins-SVT, which focus on the meticulous design of the spatial attention mechanism, resulting in a simple but more effective solution. Since the architecture only involves matrix multiplication, and the current deep learning framework has a high degree of optimization for matrix multiplication, the architecture is very efficient and easy to implement. Moreover, this architecture can achieve excellent performance in a variety of downstream vision tasks such as image classification, object detection, and semantic segmentation. [Paper](https://arxiv.org/abs/2104.13840). + +## Accuracy, FLOPs and Parameters + +| Models | Top1 | Top5 | Reference<br>
top1 | Reference
top5 | FLOPs
(G) | Params
(M) | +|:--:|:--:|:--:|:--:|:--:|:--:|:--:| +| pcpvt_small | 0.8082 | 0.9552 | 0.812 | - | 3.7 | 24.1 | +| pcpvt_base | 0.8242 | 0.9619 | 0.827 | - | 6.4 | 43.8 | +| pcpvt_large | 0.8273 | 0.9650 | 0.831 | - | 9.5 | 60.9 | +| alt_gvt_small | 0.8140 | 0.9546 | 0.817 | - | 2.8 | 24 | +| alt_gvt_base | 0.8294 | 0.9621 | 0.832 | - | 8.3 | 56 | +| alt_gvt_large | 0.8331 | 0.9642 | 0.837 | - | 14.8 | 99.2 | + +**Note**: The difference in accuracy from Reference is due to the difference in data preprocessing. diff --git a/docs/zh_CN/models/Twins.md b/docs/zh_CN/models/Twins.md index 424f3985df00216c048e026632c43f9e720f4542..143dc6fe7e199e34e3d91a1f0153a70ba96ca932 100644 --- a/docs/zh_CN/models/Twins.md +++ b/docs/zh_CN/models/Twins.md @@ -3,9 +3,9 @@ ## 概述 Twins网络包括Twins-PCPVT和Twins-SVT，其重点对空间注意力机制进行了精心设计，得到了简单却更为有效的方案。由于该体系结构仅涉及矩阵乘法，而目前的深度学习框架中对矩阵乘法有较高的优化程度，因此该体系结构十分高效且易于实现。并且，该体系结构在图像分类、目标检测和语义分割等多种下游视觉任务中都能够取得优异的性能。[论文地址](https://arxiv.org/abs/2104.13840)。 -## 精度、FLOPS和参数量 +## 精度、FLOPs和参数量 -| Models | Top1 | Top5 | Reference<br>
top1 | Reference
top5 | FLOPS
(G) | Params
(M) | +| Models | Top1 | Top5 | Reference
top1 | Reference
top5 | FLOPs
(G) | Params
(M) | |:--:|:--:|:--:|:--:|:--:|:--:|:--:| | pcpvt_small | 0.8082 | 0.9552 | 0.812 | - | 3.7 | 24.1 | | pcpvt_base | 0.8242 | 0.9619 | 0.827 | - | 6.4 | 43.8 | diff --git a/ppcls/arch/backbone/model_zoo/gvt.py b/ppcls/arch/backbone/model_zoo/gvt.py index 810f2b1b72651c3cdb1f87f0b61b474a2b64d9cf..3553073dad8f2110ddaca59a451230c447812bc9 100644 --- a/ppcls/arch/backbone/model_zoo/gvt.py +++ b/ppcls/arch/backbone/model_zoo/gvt.py @@ -82,11 +82,11 @@ class GroupAttention(nn.Layer): B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads ]).transpose([3, 0, 1, 4, 2, 5]) q, k, v = qkv[0], qkv[1], qkv[2] - attn = (q @ k.transpose([0, 1, 2, 4, 3])) * self.scale + attn = paddle.matmul(q, k.transpose([0, 1, 2, 4, 3])) * self.scale attn = nn.Softmax(axis=-1)(attn) attn = self.attn_drop(attn) - attn = (attn @ v).transpose([0, 1, 3, 2, 4]).reshape( + attn = paddle.matmul(attn, v).transpose([0, 1, 3, 2, 4]).reshape( [B, h_group, w_group, self.ws, self.ws, C]) x = attn.transpose([0, 1, 3, 2, 4, 5]).reshape([B, N, C]) @@ -147,11 +147,11 @@ class Attention(nn.Layer): [2, 0, 3, 1, 4]) k, v = kv[0], kv[1] - attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale + attn = paddle.matmul(q, k.transpose([0, 1, 3, 2])) * self.scale attn = nn.Softmax(axis=-1)(attn) attn = self.attn_drop(attn) - x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = paddle.matmul(attn, v).transpose([0, 2, 1, 3]).reshape([B, N, C]) x = self.proj(x) x = self.proj_drop(x) return x