PaddlePaddle / PaddleClas
Commit 9e2a8e86
Authored on Jun 10, 2021 by gaotingquan

Add LeViT and Twins

Parent: f4f09840
Showing 4 changed files with 1195 additions and 4 deletions (+1195 / -4)
ppcls/arch/backbone/__init__.py  (+3 / -1)
ppcls/arch/backbone/model_zoo/gvt.py  (+659 / -0)
ppcls/arch/backbone/model_zoo/levit.py  (+515 / -0)
ppcls/arch/backbone/model_zoo/vision_transformer.py  (+18 / -3)
ppcls/arch/backbone/__init__.py

@@ -47,4 +47,6 @@ from ppcls.arch.backbone.model_zoo.distillation_models import ResNet50_vd_distil
 from ppcls.arch.backbone.model_zoo.swin_transformer import SwinTransformer_tiny_patch4_window7_224, SwinTransformer_small_patch4_window7_224, SwinTransformer_base_patch4_window7_224, SwinTransformer_base_patch4_window12_384, SwinTransformer_large_patch4_window7_224, SwinTransformer_large_patch4_window12_384
 from ppcls.arch.backbone.model_zoo.mixnet import MixNet_S, MixNet_M, MixNet_L
 from ppcls.arch.backbone.model_zoo.rexnet import ReXNet_1_0, ReXNet_1_3, ReXNet_1_5, ReXNet_2_0, ReXNet_3_0
-from ppcls.arch.backbone.variant_models.resnet_variant import ResNet50_last_stage_stride1
+from ppcls.arch.backbone.model_zoo.gvt import pcpvt_small, pcpvt_base, pcpvt_large, alt_gvt_small, alt_gvt_base, alt_gvt_large
+from ppcls.arch.backbone.model_zoo.levit import LeViT_128S, LeViT_128, LeViT_192, LeViT_256, LeViT_384
+from ppcls.arch.backbone.variant_models.resnet_variant import ResNet50_last_stage_stride1
\ No newline at end of file
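With these imports in place, the new backbones are exposed through ppcls.arch.backbone like every other model in the zoo. A minimal smoke test, assuming a PaddleClas checkout with this commit applied (batch size and input resolution here are arbitrary choices, not anything the commit mandates):

    import paddle
    from ppcls.arch.backbone import pcpvt_small, LeViT_128S

    x = paddle.randn([2, 3, 224, 224])   # NCHW batch of two 224x224 RGB images

    twins = pcpvt_small()                # Twins PCPVT-small, 1000 classes by default
    print(twins(x).shape)                # [2, 1000]

    levit = LeViT_128S()
    levit.eval()                         # at eval time the two LeViT heads are averaged
    print(levit(x).shape)                # [2, 1000]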
ppcls/arch/backbone/model_zoo/gvt.py (new file, mode 100644)
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.regularizer import L2Decay

from .vision_transformer import trunc_normal_, normal_, zeros_, ones_, to_2tuple, DropPath, Identity, Mlp
from .vision_transformer import Block as ViTBlock

__all__ = [
    "CPVTV2", "PCPVT", "ALTGVT", "pcpvt_small", "pcpvt_base", "pcpvt_large",
    "alt_gvt_small", "alt_gvt_base", "alt_gvt_large"
]
class GroupAttention(nn.Layer):
    """LSA: self attention within a group.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 ws=1):
        super().__init__()
        if ws == 1:
            raise Exception(f"ws {ws} should not be 1")
        if dim % num_heads != 0:
            raise Exception(
                f"dim {dim} should be divided by num_heads {num_heads}.")
        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.ws = ws

    def forward(self, x, H, W):
        B, N, C = x.shape
        h_group, w_group = H // self.ws, W // self.ws
        total_groups = h_group * w_group
        x = x.reshape([B, h_group, self.ws, w_group, self.ws, C]).transpose(
            [0, 1, 3, 2, 4, 5])
        qkv = self.qkv(x).reshape(
            [B, total_groups, -1, 3, self.num_heads,
             C // self.num_heads]).transpose([3, 0, 1, 4, 2, 5])
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = (q @ k.transpose([0, 1, 2, 4, 3])) * self.scale
        attn = nn.Softmax(axis=-1)(attn)
        attn = self.attn_drop(attn)
        attn = (attn @ v).transpose([0, 1, 3, 2, 4]).reshape(
            [B, h_group, w_group, self.ws, self.ws, C])
        x = attn.transpose([0, 1, 3, 2, 4, 5]).reshape([B, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
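Concretely, the reshape/transpose pair above carves the H x W token grid into ws x ws windows before attention runs. A standalone shape check with hypothetical sizes (a 14x14 grid with window 7; these numbers are illustrative, not taken from the commit):

    import paddle

    B, H, W, C, ws = 2, 14, 14, 64, 7
    x = paddle.randn([B, H * W, C])
    h_group, w_group = H // ws, W // ws   # 2 x 2 = 4 windows
    windows = x.reshape([B, h_group, ws, w_group, ws, C]).transpose(
        [0, 1, 3, 2, 4, 5])
    print(windows.shape)  # [2, 2, 2, 7, 7, 64]; attention sees 49 tokens per window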
class Attention(nn.Layer):
    """GSA: using a key to summarize the information for a group to be efficient.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 sr_ratio=1):
        super().__init__()
        assert dim % num_heads == 0, \
            f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.q = nn.Linear(dim, dim, bias_attr=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.sr_ratio = sr_ratio
        if sr_ratio > 1:
            self.sr = nn.Conv2D(
                dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
            self.norm = nn.LayerNorm(dim)

    def forward(self, x, H, W):
        B, N, C = x.shape
        q = self.q(x).reshape(
            [B, N, self.num_heads, C // self.num_heads]).transpose(
                [0, 2, 1, 3])

        if self.sr_ratio > 1:
            x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W])
            x_ = self.sr(x_).reshape([B, C, -1]).transpose([0, 2, 1])
            x_ = self.norm(x_)
            kv = self.kv(x_).reshape(
                [B, -1, 2, self.num_heads, C // self.num_heads]).transpose(
                    [2, 0, 3, 1, 4])
        else:
            kv = self.kv(x).reshape(
                [B, -1, 2, self.num_heads, C // self.num_heads]).transpose(
                    [2, 0, 3, 1, 4])
        k, v = kv[0], kv[1]

        attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale
        attn = nn.Softmax(axis=-1)(attn)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
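When sr_ratio > 1, keys and values come from a strided-convolution-reduced copy of the feature map, shrinking the key/value sequence by a factor of sr_ratio squared and with it the N x N attention cost. The reduction step in isolation (all sizes hypothetical):

    import paddle
    import paddle.nn as nn

    B, H, W, C, sr = 2, 28, 28, 64, 4
    x = paddle.randn([B, H * W, C])                      # N = 784 tokens
    reduce = nn.Conv2D(C, C, kernel_size=sr, stride=sr)  # plays the role of self.sr
    x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W])
    x_ = reduce(x_).reshape([B, C, -1]).transpose([0, 2, 1])
    print(x_.shape)  # [2, 49, 64]: 784 // sr**2 key/value tokens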
class Block(nn.Layer):
    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 sr_ratio=1):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            sr_ratio=sr_ratio)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)

    def forward(self, x, H, W):
        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
class SBlock(ViTBlock):
    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 sr_ratio=1):
        super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop,
                         attn_drop, drop_path, act_layer, norm_layer)

    def forward(self, x, H, W):
        return super().forward(x)
class GroupBlock(ViTBlock):
    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 sr_ratio=1,
                 ws=1):
        super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop,
                         attn_drop, drop_path, act_layer, norm_layer)
        del self.attn
        if ws == 1:
            self.attn = Attention(dim, num_heads, qkv_bias, qk_scale,
                                  attn_drop, drop, sr_ratio)
        else:
            self.attn = GroupAttention(dim, num_heads, qkv_bias, qk_scale,
                                       attn_drop, drop, ws)

    def forward(self, x, H, W):
        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3,
                 embed_dim=768):
        super().__init__()
        if img_size % patch_size != 0:
            raise Exception(
                f"img_size {img_size} should be divided by patch_size {patch_size}."
            )

        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.H, self.W = img_size[0] // patch_size[0], \
            img_size[1] // patch_size[1]
        self.num_patches = self.H * self.W
        self.proj = nn.Conv2D(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        B, C, H, W = x.shape
        x = self.proj(x).flatten(2).transpose([0, 2, 1])
        x = self.norm(x)
        H, W = H // self.patch_size[0], W // self.patch_size[1]
        return x, (H, W)
# borrow from PVT https://github.com/whai362/PVT.git
class PyramidVisionTransformer(nn.Layer):
    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_chans=3,
                 num_classes=1000,
                 embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8],
                 mlp_ratios=[4, 4, 4, 4],
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer=nn.LayerNorm,
                 depths=[3, 4, 6, 3],
                 sr_ratios=[8, 4, 2, 1],
                 block_cls=Block):
        super().__init__()
        self.num_classes = num_classes
        self.depths = depths

        # patch_embed
        self.patch_embeds = nn.LayerList()
        self.pos_embeds = nn.ParameterList()
        self.pos_drops = nn.LayerList()
        self.blocks = nn.LayerList()

        for i in range(len(depths)):
            if i == 0:
                self.patch_embeds.append(
                    PatchEmbed(img_size, patch_size, in_chans, embed_dims[i]))
            else:
                self.patch_embeds.append(
                    PatchEmbed(img_size // patch_size // 2**(i - 1), 2,
                               embed_dims[i - 1], embed_dims[i]))
            patch_num = self.patch_embeds[i].num_patches + 1 if i == len(
                embed_dims) - 1 else self.patch_embeds[i].num_patches
            self.pos_embeds.append(
                self.create_parameter(
                    shape=[1, patch_num, embed_dims[i]],
                    default_initializer=zeros_))
            self.add_parameter(f"pos_embeds_{i}", self.pos_embeds[i])
            self.pos_drops.append(nn.Dropout(p=drop_rate))

        dpr = [
            x.numpy()[0]
            for x in paddle.linspace(0, drop_path_rate, sum(depths))
        ]  # stochastic depth decay rule

        cur = 0
        for k in range(len(depths)):
            _block = nn.LayerList([
                block_cls(
                    dim=embed_dims[k],
                    num_heads=num_heads[k],
                    mlp_ratio=mlp_ratios[k],
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[cur + i],
                    norm_layer=norm_layer,
                    sr_ratio=sr_ratios[k]) for i in range(depths[k])
            ])
            self.blocks.append(_block)
            cur += depths[k]

        self.norm = norm_layer(embed_dims[-1])

        # cls_token
        self.cls_token = self.create_parameter(
            shape=[1, 1, embed_dims[-1]],
            default_initializer=zeros_,
            attr=paddle.ParamAttr(regularizer=L2Decay(0.0)))
        self.add_parameter("cls_token", self.cls_token)

        # classification head
        self.head = nn.Linear(embed_dims[-1],
                              num_classes) if num_classes > 0 else Identity()

        # init weights
        for pos_emb in self.pos_embeds:
            trunc_normal_(pos_emb)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward_features(self, x):
        B = x.shape[0]
        for i in range(len(self.depths)):
            x, (H, W) = self.patch_embeds[i](x)
            if i == len(self.depths) - 1:
                cls_tokens = self.cls_token.expand([B, -1, -1])
                x = paddle.concat([cls_tokens, x], axis=1)
            x = x + self.pos_embeds[i]
            x = self.pos_drops[i](x)
            for blk in self.blocks[i]:
                x = blk(x, H, W)
            if i < len(self.depths) - 1:
                x = x.reshape([B, H, W, -1]).transpose([0, 3, 1, 2])
        x = self.norm(x)
        return x[:, 0]

    def forward(self, x):
        x = self.forward_features(x)
        x = self.head(x)
        return x
# PEG from https://arxiv.org/abs/2102.10882
class PosCNN(nn.Layer):
    def __init__(self, in_chans, embed_dim=768, s=1):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Conv2D(
                in_chans,
                embed_dim,
                3,
                s,
                1,
                bias_attr=paddle.ParamAttr(regularizer=L2Decay(0.0)),
                groups=embed_dim,
                weight_attr=paddle.ParamAttr(regularizer=L2Decay(0.0)), ))
        self.s = s

    def forward(self, x, H, W):
        B, N, C = x.shape
        feat_token = x
        cnn_feat = feat_token.transpose([0, 2, 1]).reshape([B, C, H, W])
        if self.s == 1:
            x = self.proj(cnn_feat) + cnn_feat
        else:
            x = self.proj(cnn_feat)
        x = x.flatten(2).transpose([0, 2, 1])
        return x
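Since the PEG builds its positional signal with a depthwise 3x3 convolution over the current H x W grid, it adapts to any input resolution without interpolating a learned table. A quick check against the class above (channel count and grid sizes are hypothetical):

    import paddle

    peg = PosCNN(in_chans=64, embed_dim=64)
    for H, W in [(14, 14), (20, 12)]:        # two different grids, one module
        tokens = paddle.randn([2, H * W, 64])
        print(peg(tokens, H, W).shape)       # [2, H*W, 64] in both cases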
class CPVTV2(PyramidVisionTransformer):
    """
    Use useful results from CPVT: PEG and GAP.
    Therefore, the cls token is no longer required.
    PEG is used to encode the absolute position on the fly, which greatly
    affects the performance when the input resolution changes during
    training (such as segmentation, detection).
    """

    def __init__(self,
                 img_size=224,
                 patch_size=4,
                 in_chans=3,
                 num_classes=1000,
                 embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8],
                 mlp_ratios=[4, 4, 4, 4],
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer=nn.LayerNorm,
                 depths=[3, 4, 6, 3],
                 sr_ratios=[8, 4, 2, 1],
                 block_cls=Block):
        super().__init__(img_size, patch_size, in_chans, num_classes,
                         embed_dims, num_heads, mlp_ratios, qkv_bias,
                         qk_scale, drop_rate, attn_drop_rate, drop_path_rate,
                         norm_layer, depths, sr_ratios, block_cls)
        del self.pos_embeds
        del self.cls_token
        self.pos_block = nn.LayerList(
            [PosCNN(embed_dim, embed_dim) for embed_dim in embed_dims])
        self.apply(self._init_weights)

    def _init_weights(self, m):
        import math
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)
        elif isinstance(m, nn.Conv2D):
            fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
            fan_out //= m._groups
            normal_(0, math.sqrt(2.0 / fan_out))(m.weight)
            if m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.BatchNorm2D):
            ones_(m.weight)
            zeros_(m.bias)

    def forward_features(self, x):
        B = x.shape[0]
        for i in range(len(self.depths)):
            x, (H, W) = self.patch_embeds[i](x)
            x = self.pos_drops[i](x)
            for j, blk in enumerate(self.blocks[i]):
                x = blk(x, H, W)
                if j == 0:
                    x = self.pos_block[i](x, H, W)  # PEG here
            if i < len(self.depths) - 1:
                x = x.reshape([B, H, W, -1]).transpose([0, 3, 1, 2])
        x = self.norm(x)
        return x.mean(axis=1)  # GAP here
class PCPVT(CPVTV2):
    def __init__(self,
                 img_size=224,
                 patch_size=4,
                 in_chans=3,
                 num_classes=1000,
                 embed_dims=[64, 128, 256],
                 num_heads=[1, 2, 4],
                 mlp_ratios=[4, 4, 4],
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer=nn.LayerNorm,
                 depths=[4, 4, 4],
                 sr_ratios=[4, 2, 1],
                 block_cls=SBlock):
        super().__init__(img_size, patch_size, in_chans, num_classes,
                         embed_dims, num_heads, mlp_ratios, qkv_bias,
                         qk_scale, drop_rate, attn_drop_rate, drop_path_rate,
                         norm_layer, depths, sr_ratios, block_cls)
class ALTGVT(PCPVT):
    """
    alias Twins-SVT
    """

    def __init__(self,
                 img_size=224,
                 patch_size=4,
                 in_chans=3,
                 class_dim=1000,
                 embed_dims=[64, 128, 256],
                 num_heads=[1, 2, 4],
                 mlp_ratios=[4, 4, 4],
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer=nn.LayerNorm,
                 depths=[4, 4, 4],
                 sr_ratios=[4, 2, 1],
                 block_cls=GroupBlock,
                 wss=[7, 7, 7]):
        super().__init__(img_size, patch_size, in_chans, class_dim,
                         embed_dims, num_heads, mlp_ratios, qkv_bias,
                         qk_scale, drop_rate, attn_drop_rate, drop_path_rate,
                         norm_layer, depths, sr_ratios, block_cls)
        del self.blocks
        self.wss = wss
        # transformer encoder
        dpr = [
            x.numpy()[0]
            for x in paddle.linspace(0, drop_path_rate, sum(depths))
        ]  # stochastic depth decay rule
        cur = 0
        self.blocks = nn.LayerList()
        for k in range(len(depths)):
            _block = nn.LayerList([
                block_cls(
                    dim=embed_dims[k],
                    num_heads=num_heads[k],
                    mlp_ratio=mlp_ratios[k],
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[cur + i],
                    norm_layer=norm_layer,
                    sr_ratio=sr_ratios[k],
                    ws=1 if i % 2 == 1 else wss[k]) for i in range(depths[k])
            ])
            self.blocks.append(_block)
            cur += depths[k]
        self.apply(self._init_weights)
def pcpvt_small(pretrained=False, **kwargs):
    model = CPVTV2(
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(
            nn.LayerNorm, epsilon=1e-6),
        depths=[3, 4, 6, 3],
        sr_ratios=[8, 4, 2, 1],
        **kwargs)
    return model


def pcpvt_base(pretrained=False, **kwargs):
    model = CPVTV2(
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(
            nn.LayerNorm, epsilon=1e-6),
        depths=[3, 4, 18, 3],
        sr_ratios=[8, 4, 2, 1],
        **kwargs)
    return model


def pcpvt_large(pretrained=False, **kwargs):
    model = CPVTV2(
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(
            nn.LayerNorm, epsilon=1e-6),
        depths=[3, 8, 27, 3],
        sr_ratios=[8, 4, 2, 1],
        **kwargs)
    return model


def alt_gvt_small(pretrained=False, **kwargs):
    model = ALTGVT(
        patch_size=4,
        embed_dims=[64, 128, 256, 512],
        num_heads=[2, 4, 8, 16],
        mlp_ratios=[4, 4, 4, 4],
        qkv_bias=True,
        norm_layer=partial(
            nn.LayerNorm, epsilon=1e-6),
        depths=[2, 2, 10, 4],
        wss=[7, 7, 7, 7],
        sr_ratios=[8, 4, 2, 1],
        **kwargs)
    return model


def alt_gvt_base(pretrained=False, **args):
    model = ALTGVT(
        patch_size=4,
        embed_dims=[96, 192, 384, 768],
        num_heads=[3, 6, 12, 24],
        mlp_ratios=[4, 4, 4, 4],
        qkv_bias=True,
        norm_layer=partial(
            nn.LayerNorm, epsilon=1e-6),
        depths=[2, 2, 18, 2],
        wss=[7, 7, 7, 7],
        sr_ratios=[8, 4, 2, 1],
        **args)
    return model


def alt_gvt_large(pretrained=False, **kwargs):
    model = ALTGVT(
        patch_size=4,
        embed_dims=[128, 256, 512, 1024],
        num_heads=[4, 8, 16, 32],
        mlp_ratios=[4, 4, 4, 4],
        qkv_bias=True,
        norm_layer=partial(
            nn.LayerNorm, epsilon=1e-6),
        depths=[2, 2, 18, 2],
        wss=[7, 7, 7, 7],
        sr_ratios=[8, 4, 2, 1],
        **kwargs)
    return model
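Note the `ws=1 if i % 2 == 1 else wss[k]` schedule in ALTGVT: even-indexed blocks run windowed LSA (GroupAttention), odd-indexed blocks fall back to the global GSA Attention, which is the alternating Twins-SVT layout. A forward-pass sketch with random weights (the `pretrained` flag is accepted by these factories but not wired to any checkpoint in this file):

    import paddle

    model = alt_gvt_small()
    model.eval()
    with paddle.no_grad():
        logits = model(paddle.randn([1, 3, 224, 224]))
    print(logits.shape)   # [1, 1000]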
ppcls/arch/backbone/model_zoo/levit.py (new file, mode 100644)
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import math
import warnings

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import TruncatedNormal, Constant
from paddle.regularizer import L2Decay

from .vision_transformer import trunc_normal_, zeros_, ones_, Identity

__all__ = ["LeViT_128S", "LeViT_128", "LeViT_192", "LeViT_256", "LeViT_384"]
def cal_attention_biases(attention_biases, attention_bias_idxs):
    gather_list = []
    attention_bias_t = paddle.transpose(attention_biases, (1, 0))
    for idx in attention_bias_idxs:
        gather = paddle.gather(attention_bias_t, idx)
        gather_list.append(gather)
    shape0, shape1 = attention_bias_idxs.shape
    return paddle.transpose(paddle.concat(gather_list),
                            (1, 0)).reshape((0, shape0, shape1))
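The helper expands one learned scalar per (head, relative-offset) pair into a dense bias table: each row of attention_bias_idxs selects, via paddle.gather on the transposed biases, the offset bias for every key position, and the final reshape (with 0 meaning "keep this input dim") restores the head axis. A toy run with made-up sizes:

    import paddle

    heads, num_offsets, N = 2, 3, 4
    biases = paddle.arange(
        heads * num_offsets, dtype='float32').reshape([heads, num_offsets])
    idxs = paddle.randint(0, num_offsets, [N, N])    # offset id per query/key pair
    print(cal_attention_biases(biases, idxs).shape)  # [2, 4, 4]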
class Conv2d_BN(nn.Sequential):
    def __init__(self,
                 a,
                 b,
                 ks=1,
                 stride=1,
                 pad=0,
                 dilation=1,
                 groups=1,
                 bn_weight_init=1,
                 resolution=-10000):
        super().__init__()
        self.add_sublayer(
            'c',
            nn.Conv2D(
                a, b, ks, stride, pad, dilation, groups, bias_attr=False))
        bn = nn.BatchNorm2D(b)
        ones_(bn.weight)
        zeros_(bn.bias)
        self.add_sublayer('bn', bn)
class Linear_BN(nn.Sequential):
    def __init__(self, a, b, bn_weight_init=1):
        super().__init__()
        self.add_sublayer('c', nn.Linear(a, b, bias_attr=False))
        bn = nn.BatchNorm1D(b)
        ones_(bn.weight)
        zeros_(bn.bias)
        self.add_sublayer('bn', bn)

    def forward(self, x):
        l, bn = self._sub_layers.values()
        x = l(x)
        return paddle.reshape(bn(x.flatten(0, 1)), x.shape)
class BN_Linear(nn.Sequential):
    def __init__(self, a, b, bias=True, std=0.02):
        super().__init__()
        self.add_sublayer('bn', nn.BatchNorm1D(a))
        l = nn.Linear(a, b, bias_attr=bias)
        trunc_normal_(l.weight)
        if bias:
            zeros_(l.bias)
        self.add_sublayer('l', l)
def b16(n, activation, resolution=224):
    return nn.Sequential(
        Conv2d_BN(
            3, n // 8, 3, 2, 1, resolution=resolution),
        activation(),
        Conv2d_BN(
            n // 8, n // 4, 3, 2, 1, resolution=resolution // 2),
        activation(),
        Conv2d_BN(
            n // 4, n // 2, 3, 2, 1, resolution=resolution // 4),
        activation(),
        Conv2d_BN(
            n // 2, n, 3, 2, 1, resolution=resolution // 8))
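b16 is the convolutional stem: four stride-2 Conv-BN stages shrink a 224x224 image to a 14x14 map while widening channels 3 -> n/8 -> n/4 -> n/2 -> n, so the transformer starts from 196 tokens rather than raw patches. A shape check, taking n = 128 as in LeViT-128:

    import paddle
    import paddle.nn as nn

    stem = b16(128, activation=nn.Hardswish)
    feat = stem(paddle.randn([1, 3, 224, 224]))
    print(feat.shape)   # [1, 128, 14, 14] -> 14 * 14 = 196 tokens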
class Residual(nn.Layer):
    def __init__(self, m, drop):
        super().__init__()
        self.m = m
        self.drop = drop

    def forward(self, x):
        if self.training and self.drop > 0:
            # stochastic depth: keep the residual branch per sample with
            # probability 1 - drop, rescaling so the expectation is unchanged
            mask = (paddle.rand([x.shape[0], 1, 1]) >= self.drop).astype(
                x.dtype) / (1 - self.drop)
            return x + self.m(x) * mask.detach()
        else:
            return x + self.m(x)
class Attention(nn.Layer):
    def __init__(self,
                 dim,
                 key_dim,
                 num_heads=8,
                 attn_ratio=4,
                 activation=None,
                 resolution=14):
        super().__init__()
        self.num_heads = num_heads
        self.scale = key_dim**-0.5
        self.key_dim = key_dim
        self.nh_kd = nh_kd = key_dim * num_heads
        self.d = int(attn_ratio * key_dim)
        self.dh = int(attn_ratio * key_dim) * num_heads
        self.attn_ratio = attn_ratio
        self.h = self.dh + nh_kd * 2
        self.qkv = Linear_BN(dim, self.h)
        self.proj = nn.Sequential(
            activation(), Linear_BN(
                self.dh, dim, bn_weight_init=0))
        points = list(itertools.product(range(resolution), range(resolution)))
        N = len(points)
        attention_offsets = {}
        idxs = []
        for p1 in points:
            for p2 in points:
                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = self.create_parameter(
            shape=(num_heads, len(attention_offsets)),
            default_initializer=zeros_,
            attr=paddle.ParamAttr(regularizer=L2Decay(0.0)))
        tensor_idxs = paddle.to_tensor(idxs, dtype='int64')
        self.register_buffer('attention_bias_idxs',
                             paddle.reshape(tensor_idxs, [N, N]))

    @paddle.no_grad()
    def train(self, mode=True):
        if mode:
            super().train()
        else:
            super().eval()
        if mode and hasattr(self, 'ab'):
            del self.ab
        else:
            self.ab = cal_attention_biases(self.attention_biases,
                                           self.attention_bias_idxs)

    def forward(self, x):
        self.training = True
        B, N, C = x.shape
        qkv = self.qkv(x)
        qkv = paddle.reshape(qkv,
                             [B, N, self.num_heads, self.h // self.num_heads])
        q, k, v = paddle.split(
            qkv, [self.key_dim, self.key_dim, self.d], axis=3)
        q = paddle.transpose(q, perm=[0, 2, 1, 3])
        k = paddle.transpose(k, perm=[0, 2, 1, 3])
        v = paddle.transpose(v, perm=[0, 2, 1, 3])
        k_transpose = paddle.transpose(k, perm=[0, 1, 3, 2])

        if self.training:
            attention_biases = cal_attention_biases(self.attention_biases,
                                                    self.attention_bias_idxs)
        else:
            attention_biases = self.ab
        attn = ((q @ k_transpose) * self.scale + attention_biases)
        attn = F.softmax(attn)
        x = paddle.transpose(attn @ v, perm=[0, 2, 1, 3])
        x = paddle.reshape(x, [B, N, self.dh])
        x = self.proj(x)
        return x
class Subsample(nn.Layer):
    def __init__(self, stride, resolution):
        super().__init__()
        self.stride = stride
        self.resolution = resolution

    def forward(self, x):
        B, N, C = x.shape
        x = paddle.reshape(x, [B, self.resolution, self.resolution,
                               C])[:, ::self.stride, ::self.stride]
        x = paddle.reshape(x, [B, -1, C])
        return x
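Subsample is plain strided slicing on the token grid: reshape to (B, resolution, resolution, C), keep every stride-th row and column, flatten back. For example:

    import paddle

    sub = Subsample(stride=2, resolution=14)
    tokens = paddle.randn([2, 14 * 14, 64])
    print(sub(tokens).shape)   # [2, 49, 64]: the 14x14 grid becomes 7x7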
class AttentionSubsample(nn.Layer):
    def __init__(self,
                 in_dim,
                 out_dim,
                 key_dim,
                 num_heads=8,
                 attn_ratio=2,
                 activation=None,
                 stride=2,
                 resolution=14,
                 resolution_=7):
        super().__init__()
        self.num_heads = num_heads
        self.scale = key_dim**-0.5
        self.key_dim = key_dim
        self.nh_kd = nh_kd = key_dim * num_heads
        self.d = int(attn_ratio * key_dim)
        self.dh = int(attn_ratio * key_dim) * self.num_heads
        self.attn_ratio = attn_ratio
        self.resolution_ = resolution_
        self.resolution_2 = resolution_**2
        self.training = True
        h = self.dh + nh_kd
        self.kv = Linear_BN(in_dim, h)
        self.q = nn.Sequential(
            Subsample(stride, resolution), Linear_BN(in_dim, nh_kd))
        self.proj = nn.Sequential(activation(), Linear_BN(self.dh, out_dim))

        self.stride = stride
        self.resolution = resolution
        points = list(itertools.product(range(resolution), range(resolution)))
        points_ = list(
            itertools.product(range(resolution_), range(resolution_)))
        N = len(points)
        N_ = len(points_)
        attention_offsets = {}
        idxs = []
        i = 0
        j = 0
        for p1 in points_:
            i += 1
            for p2 in points:
                j += 1
                size = 1
                offset = (abs(p1[0] * stride - p2[0] + (size - 1) / 2),
                          abs(p1[1] * stride - p2[1] + (size - 1) / 2))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = self.create_parameter(
            shape=(num_heads, len(attention_offsets)),
            default_initializer=zeros_,
            attr=paddle.ParamAttr(regularizer=L2Decay(0.0)))
        tensor_idxs_ = paddle.to_tensor(idxs, dtype='int64')
        self.register_buffer('attention_bias_idxs',
                             paddle.reshape(tensor_idxs_, [N_, N]))

    @paddle.no_grad()
    def train(self, mode=True):
        if mode:
            super().train()
        else:
            super().eval()
        if mode and hasattr(self, 'ab'):
            del self.ab
        else:
            self.ab = cal_attention_biases(self.attention_biases,
                                           self.attention_bias_idxs)

    def forward(self, x):
        self.training = True
        B, N, C = x.shape
        kv = self.kv(x)
        kv = paddle.reshape(kv, [B, N, self.num_heads, -1])
        k, v = paddle.split(kv, [self.key_dim, self.d], axis=3)
        k = paddle.transpose(k, perm=[0, 2, 1, 3])  # BHNC
        v = paddle.transpose(v, perm=[0, 2, 1, 3])
        q = paddle.reshape(
            self.q(x), [B, self.resolution_2, self.num_heads, self.key_dim])
        q = paddle.transpose(q, perm=[0, 2, 1, 3])

        if self.training:
            attention_biases = cal_attention_biases(self.attention_biases,
                                                    self.attention_bias_idxs)
        else:
            attention_biases = self.ab
        attn = (q @ paddle.transpose(
            k, perm=[0, 1, 3, 2])) * self.scale + attention_biases
        attn = F.softmax(attn)

        x = paddle.reshape(
            paddle.transpose(
                (attn @ v), perm=[0, 2, 1, 3]), [B, -1, self.dh])
        x = self.proj(x)
        return x
class LeViT(nn.Layer):
    """ Vision Transformer with support for patch or hybrid CNN input stage
    """

    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_chans=3,
                 class_dim=1000,
                 embed_dim=[192],
                 key_dim=[64],
                 depth=[12],
                 num_heads=[3],
                 attn_ratio=[2],
                 mlp_ratio=[2],
                 hybrid_backbone=None,
                 down_ops=[],
                 attention_activation=nn.Hardswish,
                 mlp_activation=nn.Hardswish,
                 distillation=True,
                 drop_path=0):
        super().__init__()
        self.class_dim = class_dim
        self.num_features = embed_dim[-1]
        self.embed_dim = embed_dim
        self.distillation = distillation

        self.patch_embed = hybrid_backbone
        self.blocks = []
        down_ops.append([''])
        resolution = img_size // patch_size
        for i, (ed, kd, dpth, nh, ar, mr, do) in enumerate(
                zip(embed_dim, key_dim, depth, num_heads, attn_ratio,
                    mlp_ratio, down_ops)):
            for _ in range(dpth):
                self.blocks.append(
                    Residual(
                        Attention(
                            ed,
                            kd,
                            nh,
                            attn_ratio=ar,
                            activation=attention_activation,
                            resolution=resolution, ),
                        drop_path))
                if mr > 0:
                    h = int(ed * mr)
                    self.blocks.append(
                        Residual(
                            nn.Sequential(
                                Linear_BN(ed, h),
                                mlp_activation(),
                                Linear_BN(
                                    h, ed, bn_weight_init=0), ),
                            drop_path))
            if do[0] == 'Subsample':
                #('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride)
                resolution_ = (resolution - 1) // do[5] + 1
                self.blocks.append(
                    AttentionSubsample(
                        *embed_dim[i:i + 2],
                        key_dim=do[1],
                        num_heads=do[2],
                        attn_ratio=do[3],
                        activation=attention_activation,
                        stride=do[5],
                        resolution=resolution,
                        resolution_=resolution_))
                resolution = resolution_
                if do[4] > 0:  # mlp_ratio
                    h = int(embed_dim[i + 1] * do[4])
                    self.blocks.append(
                        Residual(
                            nn.Sequential(
                                Linear_BN(embed_dim[i + 1], h),
                                mlp_activation(),
                                Linear_BN(
                                    h, embed_dim[i + 1], bn_weight_init=0), ),
                            drop_path))
        self.blocks = nn.Sequential(*self.blocks)

        # Classifier head
        self.head = BN_Linear(embed_dim[-1],
                              class_dim) if class_dim > 0 else Identity()
        if distillation:
            self.head_dist = BN_Linear(
                embed_dim[-1], class_dim) if class_dim > 0 else Identity()

    def forward(self, x):
        x = self.patch_embed(x)
        x = x.flatten(2)
        x = paddle.transpose(x, perm=[0, 2, 1])
        x = self.blocks(x)
        x = x.mean(1)
        if self.distillation:
            x = self.head(x), self.head_dist(x)
            if not self.training:
                x = (x[0] + x[1]) / 2
        else:
            x = self.head(x)
        return x
def model_factory(C, D, X, N, drop_path, class_dim, distillation):
    embed_dim = [int(x) for x in C.split('_')]
    num_heads = [int(x) for x in N.split('_')]
    depth = [int(x) for x in X.split('_')]
    act = nn.Hardswish
    model = LeViT(
        patch_size=16,
        embed_dim=embed_dim,
        num_heads=num_heads,
        key_dim=[D] * 3,
        depth=depth,
        attn_ratio=[2, 2, 2],
        mlp_ratio=[2, 2, 2],
        down_ops=[
            #('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride)
            ['Subsample', D, embed_dim[0] // D, 4, 2, 2],
            ['Subsample', D, embed_dim[1] // D, 4, 2, 2],
        ],
        attention_activation=act,
        mlp_activation=act,
        hybrid_backbone=b16(embed_dim[0], activation=act),
        class_dim=class_dim,
        drop_path=drop_path,
        distillation=distillation)
    return model
specification = {
    'LeViT_128S': {
        'C': '128_256_384',
        'D': 16,
        'N': '4_6_8',
        'X': '2_3_4',
        'drop_path': 0
    },
    'LeViT_128': {
        'C': '128_256_384',
        'D': 16,
        'N': '4_8_12',
        'X': '4_4_4',
        'drop_path': 0
    },
    'LeViT_192': {
        'C': '192_288_384',
        'D': 32,
        'N': '3_5_6',
        'X': '4_4_4',
        'drop_path': 0
    },
    'LeViT_256': {
        'C': '256_384_512',
        'D': 32,
        'N': '4_6_8',
        'X': '4_4_4',
        'drop_path': 0
    },
    'LeViT_384': {
        'C': '384_512_768',
        'D': 32,
        'N': '6_9_12',
        'X': '4_4_4',
        'drop_path': 0.1
    },
}
def LeViT_128S(class_dim=1000, distillation=True, pretrained=False):
    return model_factory(
        **specification['LeViT_128S'],
        class_dim=class_dim,
        distillation=distillation)


def LeViT_128(class_dim=1000, distillation=True):
    return model_factory(
        **specification['LeViT_128'],
        class_dim=class_dim,
        distillation=distillation)


def LeViT_192(class_dim=1000, distillation=True):
    return model_factory(
        **specification['LeViT_192'],
        class_dim=class_dim,
        distillation=distillation)


def LeViT_256(class_dim=1000, distillation=False):
    return model_factory(
        **specification['LeViT_256'],
        class_dim=class_dim,
        distillation=distillation)


def LeViT_384(class_dim=1000, distillation=True):
    return model_factory(
        **specification['LeViT_384'],
        class_dim=class_dim,
        distillation=distillation)
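Each factory unpacks its row of `specification` (C: per-stage widths, X: per-stage depths, N: heads, D: key dim) into model_factory. With distillation=True the model carries two classifier heads and returns a (head, head_dist) tuple while training, and their average at eval time. A sketch, with batch size chosen arbitrarily:

    import paddle

    model = LeViT_128S()
    x = paddle.randn([2, 3, 224, 224])

    out = model(x)                     # training mode: a tuple of two logits
    print(out[0].shape, out[1].shape)  # [2, 1000] [2, 1000]

    model.eval()
    print(model(x).shape)              # [2, 1000]: heads averaged for inference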
ppcls/arch/backbone/model_zoo/vision_transformer.py

@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from collections import Callable
 import numpy as np
 import paddle
 import paddle.nn as nn
-from paddle.nn.initializer import TruncatedNormal, Constant
+from paddle.nn.initializer import TruncatedNormal, Constant, Normal
 __all__ = [
     "VisionTransformer", "ViT_small_patch16_224", "ViT_base_patch16_224",
@@ -25,6 +27,7 @@ __all__ = [
 ]
 trunc_normal_ = TruncatedNormal(std=.02)
+normal_ = Normal
 zeros_ = Constant(value=0.)
 ones_ = Constant(value=1.)
@@ -141,7 +144,13 @@ class Block(nn.Layer):
                  norm_layer='nn.LayerNorm',
                  epsilon=1e-5):
         super().__init__()
-        self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
+        if isinstance(norm_layer, str):
+            self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
+        elif isinstance(norm_layer, Callable):
+            self.norm1 = norm_layer(dim)
+        else:
+            raise TypeError(
+                "The norm_layer must be str or paddle.nn.layer.Layer class")
         self.attn = Attention(
             dim,
             num_heads=num_heads,
@@ -151,7 +160,13 @@ class Block(nn.Layer):
             proj_drop=drop)
         # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
         self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
-        self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
+        if isinstance(norm_layer, str):
+            self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
+        elif isinstance(norm_layer, Callable):
+            self.norm2 = norm_layer(dim)
+        else:
+            raise TypeError(
+                "The norm_layer must be str or paddle.nn.layer.Layer class")
         mlp_hidden_dim = int(dim * mlp_ratio)
         self.mlp = Mlp(in_features=dim,
                        hidden_features=mlp_hidden_dim,
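The motivation for this change is that gvt.py passes a layer factory such as partial(nn.LayerNorm, epsilon=1e-6) for norm_layer, which the old unconditional eval(norm_layer) call could not handle. Both call styles now work; a sketch using the Block class from this file (dim and num_heads values are arbitrary):

    from functools import partial
    import paddle.nn as nn

    # string form, resolved with eval() exactly as before
    blk_a = Block(dim=384, num_heads=6, norm_layer='nn.LayerNorm', epsilon=1e-6)

    # callable form, the case this commit adds (what the gvt factories rely on)
    blk_b = Block(
        dim=384, num_heads=6, norm_layer=partial(nn.LayerNorm, epsilon=1e-6))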