Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
3c1c576d
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
1 年多 前同步成功
通知
696
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3c1c576d
编写于
7月 07, 2021
作者:
S
shangliang Xu
提交者:
GitHub
7月 07, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Transformer] Add transformer base code (#3612)
* Add DETR * drop return_pad_mask in PadBatch
上级
43515234
变更
17
隐藏空白更改
内联
并排
Showing
17 changed files
with
1882 additions
and
12 deletions
+1882
-12
ppdet/data/transform/operators.py
ppdet/data/transform/operators.py
+353
-1
ppdet/modeling/__init__.py
ppdet/modeling/__init__.py
+2
-0
ppdet/modeling/architectures/__init__.py
ppdet/modeling/architectures/__init__.py
+2
-0
ppdet/modeling/architectures/detr.py
ppdet/modeling/architectures/detr.py
+93
-0
ppdet/modeling/heads/__init__.py
ppdet/modeling/heads/__init__.py
+2
-0
ppdet/modeling/heads/detr_head.py
ppdet/modeling/heads/detr_head.py
+278
-0
ppdet/modeling/initializer.py
ppdet/modeling/initializer.py
+18
-4
ppdet/modeling/layers.py
ppdet/modeling/layers.py
+179
-0
ppdet/modeling/losses/__init__.py
ppdet/modeling/losses/__init__.py
+2
-0
ppdet/modeling/losses/detr_loss.py
ppdet/modeling/losses/detr_loss.py
+230
-0
ppdet/modeling/post_process.py
ppdet/modeling/post_process.py
+65
-6
ppdet/modeling/transformers/__init__.py
ppdet/modeling/transformers/__init__.py
+23
-0
ppdet/modeling/transformers/detr_transformer.py
ppdet/modeling/transformers/detr_transformer.py
+351
-0
ppdet/modeling/transformers/matchers.py
ppdet/modeling/transformers/matchers.py
+123
-0
ppdet/modeling/transformers/position_encoding.py
ppdet/modeling/transformers/position_encoding.py
+101
-0
ppdet/modeling/transformers/utils.py
ppdet/modeling/transformers/utils.py
+58
-0
ppdet/optimizer.py
ppdet/optimizer.py
+2
-1
未找到文件。
ppdet/data/transform/operators.py
浏览文件 @
3c1c576d
...
...
@@ -40,6 +40,7 @@ from PIL import Image, ImageEnhance, ImageDraw
from
ppdet.core.workspace
import
serializable
from
ppdet.modeling.layers
import
AnchorGrid
from
ppdet.modeling
import
bbox_utils
from
..reader
import
Compose
from
.op_helper
import
(
satisfy_sample_constraint
,
filter_and_process
,
generate_sample_bbox
,
clip_bbox
,
data_anchor_sampling
,
...
...
@@ -2348,7 +2349,7 @@ class RandomResizeCrop(BaseOperator):
for
gt_segm
in
sample
[
'gt_segm'
]
]
sample
[
'gt_segm'
]
=
np
.
asarray
(
masks
).
astype
(
np
.
uint8
)
return
sample
...
...
@@ -2528,3 +2529,354 @@ class Mosaic(BaseOperator):
sample
[
'difficult'
]
=
difficult
return
sample
@register_op
class RandomSelect(BaseOperator):
    """Apply one of two transform pipelines, chosen at random per sample.

    With probability ``p`` the sample goes through ``transforms1``;
    otherwise it goes through ``transforms2``.
    """

    def __init__(self, transforms1, transforms2, p=0.5):
        super(RandomSelect, self).__init__()
        self.transforms1 = Compose(transforms1)
        self.transforms2 = Compose(transforms2)
        self.p = p

    def apply(self, sample, context=None):
        # A single draw below p selects the first pipeline.
        chosen = self.transforms1 if random.random() < self.p else self.transforms2
        return chosen(sample)
@register_op
class RandomShortSideResize(BaseOperator):
    def __init__(self,
                 short_side_sizes,
                 max_size=None,
                 interp=cv2.INTER_LINEAR,
                 random_interp=False):
        """
        Resize the image randomly according to the short side. If max_size is not None,
        the long side is scaled according to max_size. The whole process will be keep ratio.
        Args:
            short_side_sizes (list|tuple): Image target short side size.
            max_size (int): The size of the longest side of image after resize.
            interp (int): The interpolation method.
            random_interp (bool): Whether random select interpolation method.
        """
        super(RandomShortSideResize, self).__init__()

        assert isinstance(short_side_sizes,
                          Sequence), "short_side_sizes must be List or Tuple"

        self.short_side_sizes = short_side_sizes
        self.max_size = max_size
        self.interp = interp
        self.random_interp = random_interp
        # Candidate interpolation methods sampled when random_interp is True.
        self.interps = [
            cv2.INTER_NEAREST,
            cv2.INTER_LINEAR,
            cv2.INTER_AREA,
            cv2.INTER_CUBIC,
            cv2.INTER_LANCZOS4,
        ]

    def get_size_with_aspect_ratio(self, image_shape, size, max_size=None):
        # Compute the aspect-preserving (w, h) whose short side equals
        # `size`; optionally shrink so the long side stays <= max_size.
        h, w = image_shape
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            # Shrink the target short side if the implied long side would
            # overflow max_size.
            if max_original_size / min_original_size * size > max_size:
                size = int(
                    round(max_size * min_original_size / max_original_size))

        # Short side already matches: return the original size unchanged.
        if (w <= h and w == size) or (h <= w and h == size):
            return (w, h)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (ow, oh)

    def resize(self, sample, target_size, max_size=None,
               interp=cv2.INTER_LINEAR):
        # Resize the image and every spatial annotation in `sample`
        # (bboxes, polygons, semantic map, instance masks) consistently.
        im = sample['image']
        if not isinstance(im, np.ndarray):
            raise TypeError("{}: image type is not numpy.".format(self))
        if len(im.shape) != 3:
            raise ImageError('{}: image is not 3-dimensional.'.format(self))

        # Final (w, h) after aspect-ratio fitting.
        target_size = self.get_size_with_aspect_ratio(im.shape[:2],
                                                      target_size, max_size)
        im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[
            0] / im.shape[1]

        sample['image'] = cv2.resize(im, target_size, interpolation=interp)
        # im_shape is stored (h, w), hence reversing the (w, h) target_size.
        sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32)
        if 'scale_factor' in sample:
            scale_factor = sample['scale_factor']
            # Accumulate onto any previously applied scaling ([y, x] order).
            sample['scale_factor'] = np.asarray(
                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
                dtype=np.float32)
        else:
            sample['scale_factor'] = np.asarray(
                [im_scale_y, im_scale_x], dtype=np.float32)

        # apply bbox
        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'],
                                                [im_scale_x, im_scale_y],
                                                target_size)
        # apply polygon
        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
            sample['gt_poly'] = self.apply_segm(
                sample['gt_poly'], im.shape[:2], [im_scale_x, im_scale_y])
        # apply semantic
        if 'semantic' in sample and sample['semantic']:
            semantic = sample['semantic']
            # NOTE(review): uses self.interp, not the per-call `interp`
            # argument — confirm this asymmetry is intended.
            semantic = cv2.resize(
                semantic.astype('float32'),
                target_size,
                interpolation=self.interp)
            semantic = np.asarray(semantic).astype('int32')
            semantic = np.expand_dims(semantic, 0)
            sample['semantic'] = semantic
        # apply gt_segm
        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
            # Nearest-neighbour keeps mask labels discrete.
            masks = [
                cv2.resize(
                    gt_segm, target_size, interpolation=cv2.INTER_NEAREST)
                for gt_segm in sample['gt_segm']
            ]
            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
        return sample

    def apply_bbox(self, bbox, scale, size):
        # Scale xyxy boxes in place and clip them to the resized canvas.
        im_scale_x, im_scale_y = scale
        resize_w, resize_h = size
        bbox[:, 0::2] *= im_scale_x
        bbox[:, 1::2] *= im_scale_y
        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
        return bbox.astype('float32')

    def apply_segm(self, segms, im_size, scale):
        # Rescale segmentations; supports both polygon lists and COCO RLE.
        def _resize_poly(poly, im_scale_x, im_scale_y):
            # poly is a flat [x0, y0, x1, y1, ...] coordinate list.
            resized_poly = np.array(poly).astype('float32')
            resized_poly[0::2] *= im_scale_x
            resized_poly[1::2] *= im_scale_y
            return resized_poly.tolist()

        def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):
            # Uncompressed (list) counts must be compiled to a full RLE first.
            if 'counts' in rle and type(rle['counts']) == list:
                rle = mask_util.frPyObjects(rle, im_h, im_w)

            mask = mask_util.decode(rle)
            mask = cv2.resize(
                mask,
                None,
                None,
                fx=im_scale_x,
                fy=im_scale_y,
                interpolation=self.interp)
            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
            return rle

        im_h, im_w = im_size
        im_scale_x, im_scale_y = scale
        resized_segms = []
        for segm in segms:
            if is_poly(segm):
                # Polygon format
                resized_segms.append([
                    _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm
                ])
            else:
                # RLE format
                # mask_util is bound here and seen by _resize_rle via closure.
                import pycocotools.mask as mask_util
                resized_segms.append(
                    _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))

        return resized_segms

    def apply(self, sample, context=None):
        # Pick a random target short side (and optionally interpolation).
        target_size = random.choice(self.short_side_sizes)
        interp = random.choice(
            self.interps) if self.random_interp else self.interp
        return self.resize(sample, target_size, self.max_size, interp)
@register_op
class RandomSizeCrop(BaseOperator):
    """
    Cut the image randomly according to `min_size` and `max_size`
    """

    def __init__(self, min_size, max_size):
        super(RandomSizeCrop, self).__init__()
        # Crop height/width are drawn uniformly from [min_size, max_size],
        # capped by the actual image dimensions in apply().
        self.min_size = min_size
        self.max_size = max_size

        # Deferred import keeps module load light; bound as an attribute
        # so crop() can call it without re-importing.
        from paddle.vision.transforms.functional import crop as paddle_crop
        self.paddle_crop = paddle_crop

    @staticmethod
    def get_crop_params(img_shape, output_size):
        """Get parameters for ``crop`` for a random crop.
        Args:
            img_shape (list|tuple): Image's height and width.
            output_size (list|tuple): Expected output size of the crop.
        Returns:
            tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
        """
        h, w = img_shape
        th, tw = output_size

        if h + 1 < th or w + 1 < tw:
            raise ValueError(
                "Required crop size {} is larger then input image size {}".
                format((th, tw), (h, w)))

        # Exact fit: the only valid crop is the whole image.
        if w == tw and h == th:
            return 0, 0, h, w

        i = random.randint(0, h - th + 1)
        j = random.randint(0, w - tw + 1)
        return i, j, th, tw

    def crop(self, sample, region):
        # Crop the image and every spatial annotation; boxes degenerating
        # to zero area are dropped together with their labels/masks.
        image_shape = sample['image'].shape[:2]
        sample['image'] = self.paddle_crop(sample['image'], *region)
        keep_index = None
        # apply bbox
        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], region)
            bbox = sample['gt_bbox'].reshape([-1, 2, 2])
            # Box area after clipping; zero-area boxes fell outside the crop.
            area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1)
            keep_index = np.where(area > 0)[0]
            sample['gt_bbox'] = sample['gt_bbox'][keep_index] if len(
                keep_index) > 0 else np.zeros(
                    [0, 4], dtype=np.float32)
            sample['gt_class'] = sample['gt_class'][keep_index] if len(
                keep_index) > 0 else np.zeros(
                    [0, 1], dtype=np.float32)
            if 'gt_score' in sample:
                sample['gt_score'] = sample['gt_score'][keep_index] if len(
                    keep_index) > 0 else np.zeros(
                        [0, 1], dtype=np.float32)
            if 'is_crowd' in sample:
                sample['is_crowd'] = sample['is_crowd'][keep_index] if len(
                    keep_index) > 0 else np.zeros(
                        [0, 1], dtype=np.float32)
        # apply polygon
        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region,
                                                image_shape)
            if keep_index is not None:
                sample['gt_poly'] = sample['gt_poly'][keep_index]
        # apply gt_segm
        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
            i, j, h, w = region
            sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w]
            if keep_index is not None:
                sample['gt_segm'] = sample['gt_segm'][keep_index]
        return sample

    def apply_bbox(self, bbox, region):
        # Translate xyxy boxes into crop coordinates, then clamp them
        # to the crop rectangle [0, w] x [0, h].
        i, j, h, w = region
        region_size = np.asarray([w, h])
        crop_bbox = bbox - np.asarray([j, i, j, i])
        crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size)
        crop_bbox = crop_bbox.clip(min=0)
        return crop_bbox.reshape([-1, 4]).astype('float32')

    def apply_segm(self, segms, region, image_shape):
        # Clip segmentations to the crop window; polygons are intersected
        # geometrically (shapely), RLE masks are sliced pixel-wise.
        def _crop_poly(segm, crop):
            xmin, ymin, xmax, ymax = crop
            crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
            crop_p = np.array(crop_coord).reshape(4, 2)
            crop_p = Polygon(crop_p)

            crop_segm = list()
            for poly in segm:
                poly = np.array(poly).reshape(len(poly) // 2, 2)
                polygon = Polygon(poly)
                if not polygon.is_valid:
                    # Repair self-intersecting polygons by re-polygonizing
                    # the exterior ring.
                    exterior = polygon.exterior
                    multi_lines = exterior.intersection(exterior)
                    polygons = shapely.ops.polygonize(multi_lines)
                    polygon = MultiPolygon(polygons)
                multi_polygon = list()
                if isinstance(polygon, MultiPolygon):
                    multi_polygon = copy.deepcopy(polygon)
                else:
                    multi_polygon.append(copy.deepcopy(polygon))
                for per_polygon in multi_polygon:
                    inter = per_polygon.intersection(crop_p)
                    if not inter:
                        continue
                    if isinstance(inter, (MultiPolygon, GeometryCollection)):
                        for part in inter:
                            if not isinstance(part, Polygon):
                                continue
                            # Drop the closing vertex and flatten to the
                            # [x0, y0, x1, y1, ...] layout.
                            part = np.squeeze(
                                np.array(part.exterior.coords[:-1]).reshape(1,
                                                                            -1))
                            part[0::2] -= xmin
                            part[1::2] -= ymin
                            crop_segm.append(part.tolist())
                    elif isinstance(inter, Polygon):
                        crop_poly = np.squeeze(
                            np.array(inter.exterior.coords[:-1]).reshape(1, -1))
                        crop_poly[0::2] -= xmin
                        crop_poly[1::2] -= ymin
                        crop_segm.append(crop_poly.tolist())
                    else:
                        continue
            return crop_segm

        def _crop_rle(rle, crop, height, width):
            # Uncompressed (list) counts must be compiled to full RLE first.
            if 'counts' in rle and type(rle['counts']) == list:
                rle = mask_util.frPyObjects(rle, height, width)
            mask = mask_util.decode(rle)
            # crop is [xmin, ymin, xmax, ymax]; slicing is rows then cols.
            mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
            return rle

        i, j, h, w = region
        crop = [j, i, j + w, i + h]
        height, width = image_shape
        crop_segms = []
        for segm in segms:
            if is_poly(segm):
                # Deferred imports are bound here and visible to
                # _crop_poly via the enclosing-function scope.
                import copy
                import shapely.ops
                from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
                # Polygon format
                crop_segms.append(_crop_poly(segm, crop))
            else:
                # RLE format
                import pycocotools.mask as mask_util
                crop_segms.append(_crop_rle(segm, crop, height, width))
        return crop_segms

    def apply(self, sample, context=None):
        # Sample a crop size bounded by both [min_size, max_size] and the
        # actual image, then pick a random placement for it.
        h = random.randint(self.min_size,
                           min(sample['image'].shape[0], self.max_size))
        w = random.randint(self.min_size,
                           min(sample['image'].shape[1], self.max_size))
        region = self.get_crop_params(sample['image'].shape[:2], [h, w])
        return self.crop(sample, region)
ppdet/modeling/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -27,6 +27,7 @@ from . import post_process
from
.
import
layers
from
.
import
reid
from
.
import
mot
from
.
import
transformers
from
.ops
import
*
from
.backbones
import
*
...
...
@@ -39,3 +40,4 @@ from .post_process import *
from
.layers
import
*
from
.reid
import
*
from
.mot
import
*
from
.transformers
import
*
ppdet/modeling/architectures/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -21,6 +21,7 @@ from . import jde
from
.
import
deepsort
from
.
import
fairmot
from
.
import
centernet
from
.
import
detr
from
.meta_arch
import
*
from
.faster_rcnn
import
*
...
...
@@ -39,3 +40,4 @@ from .deepsort import *
from
.fairmot
import
*
from
.centernet
import
*
from
.blazeface
import
*
from
.detr
import
*
ppdet/modeling/architectures/detr.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
from
.meta_arch
import
BaseArch
from
ppdet.core.workspace
import
register
,
create
__all__
=
[
'DETR'
]
@register
class DETR(BaseArch):
    """DETR detection architecture: backbone -> transformer -> DETR head.

    Args:
        backbone (nn.Layer): feature-extraction network.
        transformer (nn.Layer): DETR encoder/decoder transformer.
        detr_head (nn.Layer): classification + box-regression head.
        post_process (object): converts raw head outputs into final boxes
            at inference time; injected, defaults to 'DETRBBoxPostProcess'.
    """
    __category__ = 'architecture'
    __inject__ = ['post_process']

    def __init__(self,
                 backbone,
                 transformer,
                 detr_head,
                 post_process='DETRBBoxPostProcess'):
        super(DETR, self).__init__()
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.post_process = post_process

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Build sub-modules in dependency order, wiring each later module
        from the shapes exposed by the earlier ones."""
        backbone = create(cfg['backbone'])

        transformer = create(cfg['transformer'],
                             **{'input_shape': backbone.out_shape})

        head_kwargs = {
            'hidden_dim': transformer.hidden_dim,
            'nhead': transformer.nhead,
            'input_shape': backbone.out_shape
        }
        detr_head = create(cfg['detr_head'], **head_kwargs)

        return {
            'backbone': backbone,
            'transformer': transformer,
            "detr_head": detr_head,
        }

    def _forward(self):
        # Backbone features, then the transformer conditioned on pad_mask.
        body_feats = self.backbone(self.inputs)
        out_transformer = self.transformer(body_feats,
                                           self.inputs['pad_mask'])

        if self.training:
            # During training the head returns the loss dict directly.
            return self.detr_head(out_transformer, body_feats, self.inputs)

        preds = self.detr_head(out_transformer, body_feats)
        bbox, bbox_num = self.post_process(
            preds, self.inputs['im_shape'], self.inputs['scale_factor'])
        return bbox, bbox_num

    def get_loss(self, ):
        losses = self._forward()
        # Total loss = sum of every entry that is not a logging-only value.
        total = paddle.add_n(
            [v for k, v in losses.items() if 'log' not in k])
        losses.update({'loss': total})
        return losses

    def get_pred(self):
        bbox_pred, bbox_num = self._forward()
        return {
            "bbox": bbox_pred,
            "bbox_num": bbox_num,
        }
ppdet/modeling/heads/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -25,6 +25,7 @@ from . import face_head
from
.
import
s2anet_head
from
.
import
keypoint_hrhrnet_head
from
.
import
centernet_head
from
.
import
detr_head
from
.bbox_head
import
*
from
.mask_head
import
*
...
...
@@ -39,3 +40,4 @@ from .face_head import *
from
.s2anet_head
import
*
from
.keypoint_hrhrnet_head
import
*
from
.centernet_head
import
*
from
.detr_head
import
*
ppdet/modeling/heads/detr_head.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
ppdet.core.workspace
import
register
import
pycocotools.mask
as
mask_util
from
..initializer
import
*
__all__
=
[
'DETRHead'
]
class MLP(nn.Layer):
    """Simple multi-layer perceptron with ReLU between layers.

    The final layer is linear (no activation). Layers are initialized
    with ``linear_init_``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer i maps dims[i] -> dims[i + 1].
        dims = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.LayerList(
            nn.Linear(dims[i], dims[i + 1]) for i in range(num_layers))
        self._reset_parameters()

    def _reset_parameters(self):
        for layer in self.layers:
            linear_init_(layer)

    def forward(self, x):
        last = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last:
                x = F.relu(x)
        return x
class MultiHeadAttentionMap(nn.Layer):
    """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""

    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0,
                 bias=True):
        super().__init__()
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout)

        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.XavierUniform())
        # bias=False disables the bias parameter entirely (paddle accepts
        # bias_attr=False for that).
        bias_attr = paddle.framework.ParamAttr(
            initializer=paddle.nn.initializer.Constant()) if bias else False

        self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)
        # 1x1 conv acts as a per-pixel linear projection of the key map.
        self.k_proj = nn.Conv2D(
            query_dim,
            hidden_dim,
            1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)
        # Scaled dot-product factor: 1 / sqrt(head_dim).
        self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5

    def forward(self, q, k, mask=None):
        q = self.q_proj(q)
        k = self.k_proj(k)
        # bs: batch, num_queries: decoder queries, n: heads,
        # c: per-head channels, (h, w): key feature-map size.
        bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\
            self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]
        qh = q.reshape([bs, num_queries, n, c])
        kh = k.reshape([bs, n, c, h, w])
        # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
        # The einsum above is emulated with batched matmul: fold (bs, n)
        # into the batch dim, contract over c, then restore the layout.
        qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c])
        kh = kh.reshape([-1, c, h * w])
        weights = paddle.bmm(qh * self.normalize_fact, kh).reshape(
            [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4])

        if mask is not None:
            weights += mask
        # fix a potential bug: https://github.com/facebookresearch/detr/issues/247
        # Softmax over the flattened (h*w) spatial positions.
        weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape)
        weights = self.dropout(weights)
        return weights
class MaskHeadFPNConv(nn.Layer):
    """
    Simple convolutional head, using group norm.
    Upsampling is done using a FPN approach
    """

    def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):
        super().__init__()

        # Channel schedule: input_dim, then context_dim/2, /4, /8, /16.
        inter_dims = [input_dim,
                      ] + [context_dim // (2**i) for i in range(1, 5)]
        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.KaimingUniform())
        bias_attr = paddle.framework.ParamAttr(
            initializer=paddle.nn.initializer.Constant())

        self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,
                                       weight_attr, bias_attr)
        # One conv stage per step down the channel schedule.
        self.conv_inter = nn.LayerList()
        for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]):
            self.conv_inter.append(
                self._make_layers(in_dims, out_dims, 3, num_groups,
                                  weight_attr, bias_attr))

        # Final 3x3 conv producing a single-channel mask logit map.
        self.conv_out = nn.Conv2D(
            inter_dims[-1],
            1,
            3,
            padding=1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)

        # 1x1 adapters matching each FPN input to the stage's channels.
        self.adapter = nn.LayerList()
        for i in range(len(fpn_dims)):
            self.adapter.append(
                nn.Conv2D(
                    fpn_dims[i],
                    inter_dims[i + 1],
                    1,
                    weight_attr=weight_attr,
                    bias_attr=bias_attr))

    def _make_layers(self,
                     in_dims,
                     out_dims,
                     kernel_size,
                     num_groups,
                     weight_attr=None,
                     bias_attr=None):
        # Conv -> GroupNorm -> ReLU building block.
        return nn.Sequential(
            nn.Conv2D(
                in_dims,
                out_dims,
                kernel_size,
                padding=kernel_size // 2,
                weight_attr=weight_attr,
                bias_attr=bias_attr),
            nn.GroupNorm(num_groups, out_dims),
            nn.ReLU())

    def forward(self, x, bbox_attention_map, fpns):
        # Replicate the projected source once per query and concatenate
        # the per-query attention maps along channels.
        x = paddle.concat([
            x.tile([bbox_attention_map.shape[1], 1, 1, 1]),
            bbox_attention_map.flatten(0, 1)
        ], 1)
        x = self.conv0(x)
        # FPN-style refinement: upsample, add adapted lateral feature,
        # convolve; the last conv_inter stage has no lateral input.
        for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1],
                                                    self.adapter, fpns):
            feat = adapter_layer(feat).tile(
                [bbox_attention_map.shape[1], 1, 1, 1])
            x = inter_layer(x)
            x = feat + F.interpolate(x, size=feat.shape[-2:])

        x = self.conv_inter[-1](x)
        x = self.conv_out(x)
        return x
@register
class DETRHead(nn.Layer):
    # Prediction head for DETR: classification + box regression, with an
    # optional mask branch for segmentation.
    __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']
    __inject__ = ['loss']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 nhead=8,
                 num_mlp_layers=3,
                 loss='DETRLoss',
                 fpn_dims=[1024, 512, 256],  # NOTE(review): mutable default; never mutated here, but a tuple would be safer — confirm config system allows it
                 with_mask_head=False,
                 use_focal_loss=False):
        super(DETRHead, self).__init__()
        # add background class
        # (focal loss scores all classes directly, so no extra slot needed)
        self.num_classes = num_classes if use_focal_loss else num_classes + 1
        self.hidden_dim = hidden_dim
        self.loss = loss
        self.with_mask_head = with_mask_head
        self.use_focal_loss = use_focal_loss

        self.score_head = nn.Linear(hidden_dim, self.num_classes)
        # 4 outputs: normalized box parameters squashed by sigmoid in forward.
        self.bbox_head = MLP(hidden_dim,
                             hidden_dim,
                             output_dim=4,
                             num_layers=num_mlp_layers)
        if self.with_mask_head:
            self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim,
                                                        nhead)
            self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims,
                                             hidden_dim)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.score_head)

    @classmethod
    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
        # fpn_dims: backbone channels from deepest to shallowest, dropping
        # the deepest level (it feeds the transformer, not the FPN).
        return {
            'hidden_dim': hidden_dim,
            'nhead': nhead,
            'fpn_dims': [i.channels for i in input_shape[::-1]][1:]
        }

    @staticmethod
    def get_gt_mask_from_polygons(gt_poly, pad_mask):
        # Rasterize COCO polygon annotations into per-object binary masks,
        # zero-padded up to the batch's padded image size.
        out_gt_mask = []
        for polygons, padding in zip(gt_poly, pad_mask):
            # pad_mask is 1 on valid pixels, so row/column sums recover the
            # unpadded image height and width.
            height, width = int(padding[:, 0].sum()), int(padding[0, :].sum())
            masks = []
            for obj_poly in polygons:
                rles = mask_util.frPyObjects(obj_poly, height, width)
                rle = mask_util.merge(rles)
                masks.append(
                    paddle.to_tensor(mask_util.decode(rle)).astype('float32'))
            masks = paddle.stack(masks)
            masks_pad = paddle.zeros(
                [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]])
            masks_pad[:, :height, :width] = masks
            out_gt_mask.append(masks_pad)
        return out_gt_mask

    def forward(self, out_transformer, body_feats, inputs=None):
        r"""
        Args:
            out_transformer (Tuple): (feats: [num_levels, batch_size,
                                        num_queries, hidden_dim],
                            memory: [batch_size, hidden_dim, h, w],
                            src_proj: [batch_size, h*w, hidden_dim],
                            src_mask: [batch_size, 1, 1, h, w])
            body_feats (List(Tensor)): list[[B, C, H, W]]
            inputs (dict): dict(inputs)

        Returns:
            Training: the loss dict produced by ``self.loss``.
            Inference: (last-level boxes, last-level logits, seg output
            or None).
        """
        feats, memory, src_proj, src_mask = out_transformer
        outputs_logit = self.score_head(feats)
        # Sigmoid keeps box parameters in [0, 1].
        outputs_bbox = F.sigmoid(self.bbox_head(feats))
        outputs_seg = None
        if self.with_mask_head:
            # Attention between the last decoder level and the encoder memory
            # provides per-query spatial maps for the mask head.
            bbox_attention_map = self.bbox_attention(feats[-1], memory,
                                                     src_mask)
            fpn_feats = [a for a in body_feats[::-1]][1:]
            outputs_seg = self.mask_head(src_proj, bbox_attention_map,
                                         fpn_feats)
            # Restore [batch, num_queries, h, w] from the flattened output.
            outputs_seg = outputs_seg.reshape([
                feats.shape[1], feats.shape[2], outputs_seg.shape[-2],
                outputs_seg.shape[-1]
            ])

        if self.training:
            assert inputs is not None
            assert 'gt_bbox' in inputs and 'gt_class' in inputs

            gt_mask = self.get_gt_mask_from_polygons(
                inputs['gt_poly'],
                inputs['pad_mask']) if 'gt_poly' in inputs else None
            return self.loss(
                outputs_bbox,
                outputs_logit,
                inputs['gt_bbox'],
                inputs['gt_class'],
                masks=outputs_seg,
                gt_mask=gt_mask)
        else:
            return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)
ppdet/modeling/initializer.py
浏览文件 @
3c1c576d
...
...
@@ -28,6 +28,8 @@ __all__ = [
'xavier_normal_'
,
'kaiming_uniform_'
,
'kaiming_normal_'
,
'linear_init_'
,
'conv_init_'
,
'reset_initialized_parameter'
,
]
...
...
@@ -46,7 +48,7 @@ def _no_grad_normal_(tensor, mean=0., std=1.):
return
tensor
def
_no_grad_fill_
(
tensor
,
value
=
0
):
def
_no_grad_fill_
(
tensor
,
value
=
0
.
):
with
paddle
.
no_grad
():
v
=
paddle
.
rand
(
shape
=
tensor
.
shape
,
dtype
=
tensor
.
dtype
)
v
[...]
=
value
...
...
@@ -80,7 +82,7 @@ def normal_(tensor, mean=0., std=1.):
return
_no_grad_normal_
(
tensor
,
mean
,
std
)
def
constant_
(
tensor
,
value
=
0
):
def
constant_
(
tensor
,
value
=
0
.
):
"""
Modified tensor inspace using constant_
Args:
...
...
@@ -150,7 +152,7 @@ def xavier_uniform_(tensor, gain=1., reverse=False):
Modified tensor inspace using xavier_uniform_
Args:
tensor (paddle.Tensor): paddle Tensor
gain (
str
): super parameter, 1. default.
gain (
float
): super parameter, 1. default.
reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
Return:
tensor
...
...
@@ -166,7 +168,7 @@ def xavier_normal_(tensor, gain=1., reverse=False):
Modified tensor inspace using xavier_normal_
Args:
tensor (paddle.Tensor): paddle Tensor
gain (
str
): super parameter, 1. default.
gain (
float
): super parameter, 1. default.
reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
Return:
tensor
...
...
@@ -260,6 +262,18 @@ def kaiming_normal_(tensor,
return
_no_grad_normal_
(
tensor
,
0
,
std
)
def linear_init_(module):
    """Initialize a Linear layer's weight and bias uniformly in
    [-1/sqrt(fan_in), 1/sqrt(fan_in)].

    Args:
        module: a layer exposing ``weight`` and ``bias`` parameters;
            ``weight.shape[0]`` is treated as the fan-in.
    """
    bound = 1 / math.sqrt(module.weight.shape[0])
    for param in (module.weight, module.bias):
        uniform_(param, -bound, bound)
def conv_init_(module):
    """Initialize a Conv layer's weight and bias uniformly in
    [-1/sqrt(fan_in), 1/sqrt(fan_in)].

    Args:
        module: a layer exposing ``weight`` and ``bias``; fan-in is the
            product of all weight dims except dim 0.
    """
    fan_in = math.prod(module.weight.shape[1:])
    bound = 1 / math.sqrt(fan_in)
    for param in (module.weight, module.bias):
        uniform_(param, -bound, bound)
@
paddle
.
no_grad
()
def
reset_initialized_parameter
(
model
,
include_self
=
True
):
"""
...
...
ppdet/modeling/layers.py
浏览文件 @
3c1c576d
...
...
@@ -29,8 +29,11 @@ from paddle.regularizer import L2Decay
from
ppdet.core.workspace
import
register
,
serializable
from
ppdet.modeling.bbox_utils
import
delta2bbox
from
.
import
ops
from
.initializer
import
xavier_uniform_
,
constant_
from
paddle.vision.ops
import
DeformConv2D
from
paddle.nn.layer
import
transformer
_convert_attention_mask
=
transformer
.
_convert_attention_mask
def
_to_list
(
l
):
...
...
@@ -1187,3 +1190,179 @@ class Concat(nn.Layer):
def
extra_repr
(
self
):
return
'dim={}'
.
format
(
self
.
dim
)
class
MultiHeadAttention
(
nn
.
Layer
):
"""
Attention mapps queries and a set of key-value pairs to outputs, and
Multi-Head Attention performs multiple parallel attention to jointly attending
to information from different representation subspaces.
Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
for more details.
Parameters:
embed_dim (int): The expected feature size in the input and output.
num_heads (int): The number of heads in multi-head attention.
dropout (float, optional): The dropout probability used on attention
weights to drop some attention targets. 0 for no dropout. Default 0
kdim (int, optional): The feature size in key. If None, assumed equal to
`embed_dim`. Default None.
vdim (int, optional): The feature size in value. If None, assumed equal to
`embed_dim`. Default None.
need_weights (bool, optional): Indicate whether to return the attention
weights. Default False.
Examples:
.. code-block:: python
import paddle
# encoder input: [batch_size, sequence_length, d_model]
query = paddle.rand((2, 4, 128))
# self attention mask: [batch_size, num_heads, query_len, query_len]
attn_mask = paddle.rand((2, 2, 4, 4))
multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)
output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
"""
    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 kdim=None,
                 vdim=None,
                 need_weights=False):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        # Key/value dims default to embed_dim when not given.
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        # When q/k/v dims all match, a single fused qkv projection is used.
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.need_weights = need_weights

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        if self._qkv_same_embed_dim:
            # Fused projection: one [embed_dim, 3*embed_dim] weight holding
            # q, k and v slices side by side (sliced in compute_qkv).
            self.in_proj_weight = self.create_parameter(
                shape=[embed_dim, 3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=False)
            self.in_proj_bias = self.create_parameter(
                shape=[3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=True)
        else:
            # Separate projections when key/value dims differ.
            self.q_proj = nn.Linear(embed_dim, embed_dim)
            self.k_proj = nn.Linear(self.kdim, embed_dim)
            self.v_proj = nn.Linear(self.vdim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)
        # Attribute names indexed by compute_qkv(tensor, index).
        self._type_list = ('q_proj', 'k_proj', 'v_proj')

        self._reset_parameters()
def
_reset_parameters
(
self
):
for
p
in
self
.
parameters
():
if
p
.
dim
()
>
1
:
xavier_uniform_
(
p
)
else
:
constant_
(
p
)
def
compute_qkv
(
self
,
tensor
,
index
):
if
self
.
_qkv_same_embed_dim
:
tensor
=
F
.
linear
(
x
=
tensor
,
weight
=
self
.
in_proj_weight
[:,
index
*
self
.
embed_dim
:(
index
+
1
)
*
self
.
embed_dim
],
bias
=
self
.
in_proj_bias
[
index
*
self
.
embed_dim
:(
index
+
1
)
*
self
.
embed_dim
]
if
self
.
in_proj_bias
is
not
None
else
None
)
else
:
tensor
=
getattr
(
self
,
self
.
_type_list
[
index
])(
tensor
)
tensor
=
tensor
.
reshape
(
[
0
,
0
,
self
.
num_heads
,
self
.
head_dim
]).
transpose
([
0
,
2
,
1
,
3
])
return
tensor
def
forward
(
self
,
query
,
key
=
None
,
value
=
None
,
attn_mask
=
None
):
r
"""
Applies multi-head attention to map queries and a set of key-value pairs
to outputs.
Parameters:
query (Tensor): The queries for multi-head attention. It is a
tensor with shape `[batch_size, query_length, embed_dim]`. The
data type should be float32 or float64.
key (Tensor, optional): The keys for multi-head attention. It is
a tensor with shape `[batch_size, key_length, kdim]`. The
data type should be float32 or float64. If None, use `query` as
`key`. Default None.
value (Tensor, optional): The values for multi-head attention. It
is a tensor with shape `[batch_size, value_length, vdim]`.
The data type should be float32 or float64. If None, use `query` as
`value`. Default None.
attn_mask (Tensor, optional): A tensor used in multi-head attention
to prevents attention to some unwanted positions, usually the
paddings or the subsequent positions. It is a tensor with shape
broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
When the data type is bool, the unwanted positions have `False`
values and the others have `True` values. When the data type is
int, the unwanted positions have 0 values and the others have 1
values. When the data type is float, the unwanted positions have
`-INF` values and the others have 0 values. It can be None when
nothing wanted or needed to be prevented attention to. Default None.
Returns:
Tensor|tuple: It is a tensor that has the same shape and data type \
as `query`, representing attention output. Or a tuple if \
`need_weights` is True or `cache` is not None. If `need_weights` \
is True, except for attention output, the tuple also includes \
the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \
If `cache` is not None, the tuple then includes the new cache \
having the same type as `cache`, and if it is `StaticCache`, it \
is same as the input `cache`, if it is `Cache`, the new cache \
reserves tensors concatanating raw tensors with intermediate \
results of current query.
"""
key
=
query
if
key
is
None
else
key
value
=
query
if
value
is
None
else
value
# compute q ,k ,v
q
,
k
,
v
=
(
self
.
compute_qkv
(
t
,
i
)
for
i
,
t
in
enumerate
([
query
,
key
,
value
]))
# scale dot product attention
product
=
paddle
.
matmul
(
x
=
q
,
y
=
k
,
transpose_y
=
True
)
scaling
=
float
(
self
.
head_dim
)
**-
0.5
product
=
product
*
scaling
if
attn_mask
is
not
None
:
# Support bool or int mask
attn_mask
=
_convert_attention_mask
(
attn_mask
,
product
.
dtype
)
product
=
product
+
attn_mask
weights
=
F
.
softmax
(
product
)
if
self
.
dropout
:
weights
=
F
.
dropout
(
weights
,
self
.
dropout
,
training
=
self
.
training
,
mode
=
"upscale_in_train"
)
out
=
paddle
.
matmul
(
weights
,
v
)
# combine heads
out
=
paddle
.
transpose
(
out
,
perm
=
[
0
,
2
,
1
,
3
])
out
=
paddle
.
reshape
(
x
=
out
,
shape
=
[
0
,
0
,
out
.
shape
[
2
]
*
out
.
shape
[
3
]])
# project to output
out
=
self
.
out_proj
(
out
)
outs
=
[
out
]
if
self
.
need_weights
:
outs
.
append
(
weights
)
return
out
if
len
(
outs
)
==
1
else
tuple
(
outs
)
ppdet/modeling/losses/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -22,6 +22,7 @@ from . import ctfocal_loss
from
.
import
keypoint_loss
from
.
import
jde_loss
from
.
import
fairmot_loss
from
.
import
detr_loss
from
.yolo_loss
import
*
from
.iou_aware_loss
import
*
...
...
@@ -33,3 +34,4 @@ from .ctfocal_loss import *
from
.keypoint_loss
import
*
from
.jde_loss
import
*
from
.fairmot_loss
import
*
from
.detr_loss
import
*
ppdet/modeling/losses/detr_loss.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
ppdet.core.workspace
import
register
from
.iou_loss
import
GIoULoss
from
..transformers
import
bbox_cxcywh_to_xyxy
,
bbox_overlaps
,
sigmoid_focal_loss
__all__
=
[
'DETRLoss'
]
@register
class DETRLoss(nn.Layer):
    """DETR set-prediction loss.

    Performs Hungarian matching between predictions and ground truth, then
    computes classification, L1-bbox and GIoU losses (plus optional
    focal/dice mask losses and per-decoder-layer auxiliary losses).
    """
    __shared__ = ['num_classes', 'use_focal_loss']
    __inject__ = ['matcher']

    def __init__(self,
                 num_classes=80,
                 matcher='HungarianMatcher',
                 loss_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'no_object': 0.1,
                     'mask': 1,
                     'dice': 1
                 },
                 aux_loss=True,
                 use_focal_loss=False):
        r"""
        Args:
            num_classes (int): The number of classes.
            matcher (HungarianMatcher): It computes an assignment between the targets
                and the predictions of the network.
            loss_coeff (dict): The coefficient of loss.
            aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.
            use_focal_loss (bool): Use focal loss or not.
        """
        super(DETRLoss, self).__init__()
        self.num_classes = num_classes
        self.matcher = matcher
        # Copy the dict: the default argument is a shared mutable object and
        # the branch below mutates loss_coeff['class'], which would otherwise
        # corrupt the default for every later instance.
        self.loss_coeff = dict(loss_coeff)
        self.aux_loss = aux_loss
        self.use_focal_loss = use_focal_loss

        if not self.use_focal_loss:
            # For softmax cross entropy the class coefficient becomes a
            # per-class weight vector; the extra background ("no object")
            # entry is down-weighted by loss_coeff['no_object'].
            self.loss_coeff['class'] = paddle.full([num_classes + 1],
                                                   loss_coeff['class'])
            self.loss_coeff['class'][-1] = loss_coeff['no_object']
        self.giou_loss = GIoULoss()

    def _get_loss_class(self, logits, gt_class, match_indices, bg_index,
                        num_gts):
        """Classification loss. logits: [b, query, num_classes],
        gt_class: list[[n, 1]]; unmatched queries get label bg_index."""
        target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
        bs, num_query_objects = target_label.shape
        if sum(len(a) for a in gt_class) > 0:
            # Scatter matched gt labels into the flat [bs * query] target.
            index, updates = self._get_index_updates(num_query_objects,
                                                     gt_class, match_indices)
            target_label = paddle.scatter(
                target_label.reshape([-1, 1]), index, updates.astype('int64'))
            target_label = target_label.reshape([bs, num_query_objects])
        if self.use_focal_loss:
            # One-hot without the background column for sigmoid focal loss.
            target_label = F.one_hot(target_label,
                                     self.num_classes + 1)[:, :, :-1]
        return {
            'loss_class': self.loss_coeff['class'] * sigmoid_focal_loss(
                logits, target_label, num_gts / num_query_objects)
            if self.use_focal_loss else F.cross_entropy(
                logits, target_label, weight=self.loss_coeff['class'])
        }

    def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts):
        """L1 + GIoU bbox losses. boxes: [b, query, 4], gt_bbox: list[[n, 4]]."""
        loss = dict()
        if sum(len(a) for a in gt_bbox) == 0:
            # No ground truth at all: return zero losses, keep keys stable.
            loss['loss_bbox'] = paddle.to_tensor([0.])
            loss['loss_giou'] = paddle.to_tensor([0.])
            return loss

        src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
                                                            match_indices)
        loss['loss_bbox'] = self.loss_coeff['bbox'] * F.l1_loss(
            src_bbox, target_bbox, reduction='sum') / num_gts
        loss['loss_giou'] = self.giou_loss(
            bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
        loss['loss_giou'] = loss['loss_giou'].sum() / num_gts
        loss['loss_giou'] = self.loss_coeff['giou'] * loss['loss_giou']
        return loss

    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts):
        """Focal + dice mask losses. masks: [b, query, h, w],
        gt_mask: list[[n, H, W]]."""
        loss = dict()
        if sum(len(a) for a in gt_mask) == 0:
            loss['loss_mask'] = paddle.to_tensor([0.])
            loss['loss_dice'] = paddle.to_tensor([0.])
            return loss

        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
                                                              match_indices)
        # Upsample predictions to ground-truth resolution before comparing.
        src_masks = F.interpolate(
            src_masks.unsqueeze(0),
            size=target_masks.shape[-2:],
            mode="bilinear")[0]
        loss['loss_mask'] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
            src_masks,
            target_masks,
            paddle.to_tensor(
                [num_gts], dtype='float32'))
        loss['loss_dice'] = self.loss_coeff['dice'] * self._dice_loss(
            src_masks, target_masks, num_gts)
        return loss

    def _dice_loss(self, inputs, targets, num_gts):
        """Soft dice loss on sigmoid(inputs), averaged over num_gts."""
        inputs = F.sigmoid(inputs)
        inputs = inputs.flatten(1)
        targets = targets.flatten(1)
        numerator = 2 * (inputs * targets).sum(1)
        denominator = inputs.sum(-1) + targets.sum(-1)
        # +1 in both terms smooths the ratio for empty masks.
        loss = 1 - (numerator + 1) / (denominator + 1)
        return loss.sum() / num_gts

    def _get_loss_aux(self, boxes, logits, gt_bbox, gt_class, bg_index,
                      num_gts):
        """Auxiliary losses: re-match and re-compute class/bbox/giou losses
        for every intermediate decoder layer, then sum per kind."""
        loss_class = []
        loss_bbox = []
        loss_giou = []
        for aux_boxes, aux_logits in zip(boxes, logits):
            match_indices = self.matcher(aux_boxes, aux_logits, gt_bbox,
                                         gt_class)
            loss_class.append(
                self._get_loss_class(aux_logits, gt_class, match_indices,
                                     bg_index, num_gts)['loss_class'])
            loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
                                        num_gts)
            loss_bbox.append(loss_['loss_bbox'])
            loss_giou.append(loss_['loss_giou'])
        loss = {
            'loss_class_aux': paddle.add_n(loss_class),
            'loss_bbox_aux': paddle.add_n(loss_bbox),
            'loss_giou_aux': paddle.add_n(loss_giou)
        }
        return loss

    def _get_index_updates(self, num_query_objects, target, match_indices):
        """Flatten per-image match indices into indices over the
        [bs * num_query_objects] axis plus the gathered target values."""
        batch_idx = paddle.concat([
            paddle.full_like(src, i)
            for i, (src, _) in enumerate(match_indices)
        ])
        src_idx = paddle.concat([src for (src, _) in match_indices])
        # Offset each image's query indices into the flattened batch.
        src_idx += (batch_idx * num_query_objects)
        target_assign = paddle.concat([
            paddle.gather(
                t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
        ])
        return src_idx, target_assign

    def _get_src_target_assign(self, src, target, match_indices):
        """Gather matched prediction/target pairs; images with no matches
        contribute empty [0, last_dim] tensors so concat stays valid."""
        src_assign = paddle.concat([
            paddle.gather(
                t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (I, _) in zip(src, match_indices)
        ])
        target_assign = paddle.concat([
            paddle.gather(
                t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (_, J) in zip(target, match_indices)
        ])
        return src_assign, target_assign

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None):
        r"""
        Args:
            boxes (Tensor): [l, b, query, 4]
            logits (Tensor): [l, b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor, optional): [b, query, h, w]
            gt_mask (List(Tensor), optional): list[[n, H, W]]

        Returns:
            dict: losses keyed by 'loss_class', 'loss_bbox', 'loss_giou' and,
            when applicable, mask and '*_aux' entries.
        """
        # Match on the last decoder layer only; detach so no gradient flows
        # through the assignment itself.
        match_indices = self.matcher(boxes[-1].detach(), logits[-1].detach(),
                                     gt_bbox, gt_class)
        num_gts = sum(len(a) for a in gt_bbox)
        try:
            # TODO: Paddle does not have a "paddle.distributed.is_initialized()",
            # so try averaging the gt count across ranks and fall back to the
            # local count on single-card runs.
            num_gts = paddle.to_tensor([num_gts], dtype=paddle.float32)
            paddle.distributed.all_reduce(num_gts)
            num_gts = paddle.clip(
                num_gts / paddle.distributed.get_world_size(), min=1).item()
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            num_gts = max(num_gts, 1)
        total_loss = dict()
        total_loss.update(
            self._get_loss_class(logits[-1], gt_class, match_indices,
                                 self.num_classes, num_gts))
        total_loss.update(
            self._get_loss_bbox(boxes[-1], gt_bbox, match_indices, num_gts))
        if masks is not None and gt_mask is not None:
            total_loss.update(
                self._get_loss_mask(masks, gt_mask, match_indices, num_gts))

        if self.aux_loss:
            total_loss.update(
                self._get_loss_aux(boxes[:-1], logits[:-1], gt_bbox, gt_class,
                                   self.num_classes, num_gts))

        return total_loss
ppdet/modeling/post_process.py
浏览文件 @
3c1c576d
...
...
@@ -19,18 +19,16 @@ import paddle.nn.functional as F
from ppdet.core.workspace import register
# NOTE: fixed a duplicated `rbox2poly` in this import list.
from ppdet.modeling.bbox_utils import nonempty_bbox, rbox2poly
from ppdet.modeling.layers import TTFBox
from .transformers import bbox_cxcywh_to_xyxy
try:
    from collections.abc import Sequence
except Exception:
    from collections import Sequence

# Public API of this module; each name listed exactly once (the previous
# list repeated six entries).
__all__ = [
    'BBoxPostProcess', 'MaskPostProcess', 'FCOSPostProcess',
    'S2ANetBBoxPostProcess', 'JDEBBoxPostProcess', 'CenterNetPostProcess',
    'DETRBBoxPostProcess'
]
...
...
@@ -492,3 +490,64 @@ class CenterNetPostProcess(TTFBox):
else
:
results
=
paddle
.
concat
([
clses
,
scores
,
bboxes
],
axis
=
1
)
return
results
,
paddle
.
shape
(
results
)[
0
:
1
]
@register
class DETRBBoxPostProcess(object):
    """Decode DETR head outputs into [label, score, x1, y1, x2, y2] rows
    scaled back to the original (pre-resize) image coordinates."""
    __shared__ = ['num_classes', 'use_focal_loss']
    __inject__ = []

    def __init__(self,
                 num_classes=80,
                 num_top_queries=100,
                 use_focal_loss=False):
        super(DETRBBoxPostProcess, self).__init__()
        # num_top_queries: keep at most this many highest-scoring queries.
        self.num_classes = num_classes
        self.num_top_queries = num_top_queries
        self.use_focal_loss = use_focal_loss

    def __call__(self, head_out, im_shape, scale_factor):
        """
        Decode the bbox.

        Args:
            head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output.
            im_shape (Tensor): The shape of the input image.
            scale_factor (Tensor): The scale factor of the input image.
        Returns:
            bbox_pred (Tensor): The output prediction with shape [N, 6], including
                labels, scores and bboxes. The size of bboxes are corresponding
                to the input image, the bboxes may be used in other branch.
            bbox_num (Tensor): The number of prediction boxes of each batch with
                shape [bs], and is N.
        """
        bboxes, logits, masks = head_out

        # cxcywh (normalized) -> xyxy, then scale to original image size.
        bbox_pred = bbox_cxcywh_to_xyxy(bboxes)
        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
        img_h, img_w = origin_shape.unbind(1)
        origin_shape = paddle.stack(
            [img_w, img_h, img_w, img_h], axis=-1).unsqueeze(0)
        bbox_pred *= origin_shape

        # Focal-trained heads use sigmoid scores; otherwise softmax with the
        # background column (last class) dropped.
        scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax(
            logits)[:, :, :-1]
        scores, labels = scores.max(-1), scores.argmax(-1)
        if scores.shape[1] > self.num_top_queries:
            # Keep the top-k queries per image, gathering per batch element.
            scores, index = paddle.topk(scores, self.num_top_queries, axis=-1)
            labels = paddle.stack(
                [paddle.gather(l, i) for l, i in zip(labels, index)])
            bbox_pred = paddle.stack(
                [paddle.gather(b, i) for b, i in zip(bbox_pred, index)])

        # Assemble [label, score, x1, y1, x2, y2] rows; every image emits the
        # same fixed number of rows.
        bbox_pred = paddle.concat(
            [
                labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1),
                bbox_pred
            ],
            axis=-1)
        bbox_num = paddle.to_tensor(
            bbox_pred.shape[1], dtype='int32').tile([bbox_pred.shape[0]])
        bbox_pred = bbox_pred.reshape([-1, 6])
        return bbox_pred, bbox_num
ppdet/modeling/transformers/__init__.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.
import
detr_transformer
from
.
import
utils
from
.
import
matchers
from
.
import
position_encoding
from
.detr_transformer
import
*
from
.utils
import
*
from
.matchers
import
*
from
.position_encoding
import
*
ppdet/modeling/transformers/detr_transformer.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
import
paddle.nn
as
nn
from
paddle.nn.layer.transformer
import
_convert_attention_mask
import
paddle.nn.functional
as
F
from
ppdet.core.workspace
import
register
from
..layers
import
MultiHeadAttention
from
.position_encoding
import
PositionEmbedding
from
.utils
import
*
from
..initializer
import
*
__all__
=
[
'DETRTransformer'
]
class TransformerEncoderLayer(nn.Layer):
    """One DETR encoder layer: self-attention + feed-forward, each wrapped
    with residual connection, dropout and LayerNorm (pre- or post-norm)."""

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerEncoderLayer, self).__init__()
        # attn/act dropout default to the shared dropout rate when unset.
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        # e.g. F.relu / F.gelu, resolved by name.
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        # Positional embedding is additive and optional.
        return tensor if pos_embed is None else tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        src_mask = _convert_attention_mask(src_mask, src.dtype)

        # --- self-attention sublayer (pre-/post-norm controlled by flag) ---
        residual = src
        if self.normalize_before:
            src = self.norm1(src)
        # Queries and keys carry the positional embedding; values do not.
        q = k = self.with_pos_embed(src, pos_embed)
        src = self.self_attn(q, k, value=src, attn_mask=src_mask)

        src = residual + self.dropout1(src)
        if not self.normalize_before:
            src = self.norm1(src)

        # --- feed-forward sublayer ---
        residual = src
        if self.normalize_before:
            src = self.norm2(src)
        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = residual + self.dropout2(src)
        if not self.normalize_before:
            src = self.norm2(src)
        return src
class TransformerEncoder(nn.Layer):
    """A stack of identical DETR encoder layers with an optional final norm."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, src_mask=None, pos_embed=None):
        # Convert the mask once to the additive-float convention shared by
        # every layer.
        src_mask = _convert_attention_mask(src_mask, src.dtype)

        out = src
        for encoder_layer in self.layers:
            out = encoder_layer(out, src_mask=src_mask, pos_embed=pos_embed)

        return out if self.norm is None else self.norm(out)
class TransformerDecoderLayer(nn.Layer):
    """One DETR decoder layer: self-attention over queries, cross-attention
    into the encoder memory, then feed-forward; each sublayer has residual +
    dropout + LayerNorm, applied pre- or post-norm per `normalize_before`."""

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerDecoderLayer, self).__init__()
        # attn/act dropout default to the shared dropout rate when unset.
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        # Positional embedding is additive and optional.
        return tensor if pos_embed is None else tensor + pos_embed

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)

        # --- self-attention over object queries ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt, query_pos_embed)
        tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask)
        tgt = residual + self.dropout1(tgt)
        if not self.normalize_before:
            tgt = self.norm1(tgt)

        # --- cross-attention: queries attend to encoder memory ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)
        # Query side adds the learned query embedding; key side adds the
        # spatial positional embedding of the memory.
        q = self.with_pos_embed(tgt, query_pos_embed)
        k = self.with_pos_embed(memory, pos_embed)
        tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask)
        tgt = residual + self.dropout2(tgt)
        if not self.normalize_before:
            tgt = self.norm2(tgt)

        # --- feed-forward sublayer ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm3(tgt)
        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = residual + self.dropout3(tgt)
        if not self.normalize_before:
            tgt = self.norm3(tgt)
        return tgt
class TransformerDecoder(nn.Layer):
    """A stack of DETR decoder layers.

    With ``return_intermediate`` enabled, the normalized output of every
    layer is returned stacked along a new leading axis; otherwise only the
    final output is returned, with a singleton leading axis so both modes
    produce a rank-consistent result.
    """

    def __init__(self,
                 decoder_layer,
                 num_layers,
                 norm=None,
                 return_intermediate=False):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        # Convert both masks once to the additive-float convention.
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)

        out = tgt
        per_layer_outs = []
        for decoder_layer in self.layers:
            out = decoder_layer(
                out,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                pos_embed=pos_embed,
                query_pos_embed=query_pos_embed)
            if self.return_intermediate:
                # NOTE(review): assumes self.norm is set whenever
                # return_intermediate is True (DETRTransformer always passes
                # a LayerNorm here) — confirm before reusing elsewhere.
                per_layer_outs.append(self.norm(out))

        if self.return_intermediate:
            return paddle.stack(per_layer_outs)

        if self.norm is not None:
            out = self.norm(out)
        return out.unsqueeze(0)
@register
class DETRTransformer(nn.Layer):
    """DETR encoder-decoder transformer operating on the last backbone
    feature level, with learned object-query embeddings."""
    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=100,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 backbone_num_channels=2048,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(DETRTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'],\
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        self.hidden_dim = hidden_dim
        self.nhead = nhead

        encoder_layer = TransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        # Pre-norm encoders need a final norm; post-norm ones do not.
        encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)

        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        decoder_norm = nn.LayerNorm(hidden_dim)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec)

        # 1x1 conv projecting backbone channels down to hidden_dim.
        self.input_proj = nn.Conv2D(
            backbone_num_channels, hidden_dim, kernel_size=1)
        # Learned object-query embeddings, shared across the batch.
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
        # half hidden_dim per spatial axis (x and y are concatenated).
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            normalize=True if position_embed_type == 'sine' else False,
            embed_type=position_embed_type)

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
        conv_init_(self.input_proj)
        normal_(self.query_pos_embed.weight)

    @classmethod
    def from_config(cls, cfg, input_shape):
        # Wire the last backbone level's channel count from the config system.
        return {
            'backbone_num_channels': [i.channels for i in input_shape][-1],
        }

    def forward(self, src, src_mask=None):
        r"""
        Applies a Transformer model on the inputs.

        Parameters:
            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]].
            src_mask (Tensor, optional): A tensor used in multi-head attention
                to prevents attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                [bs, H, W]`. When the data type is bool, the unwanted positions
                have `False` values and the others have `True` values. When the
                data type is int, the unwanted positions have 0 values and the
                others have 1 values. When the data type is float, the unwanted
                positions have `-INF` values and the others have 0 values. It
                can be None when nothing wanted or needed to be prevented
                attention to. Default None.

        Returns:
            tuple of four tensors:
                output (Tensor): decoder output,
                    [num_levels, batch_size, num_queries, hidden_dim]
                memory (Tensor): encoder output reshaped to
                    [batch_size, hidden_dim, h, w]
                src_proj (Tensor): projected input feature map
                    [batch_size, hidden_dim, h, w]
                src_mask (Tensor): attention mask reshaped to
                    [batch_size, 1, 1, h, w]
        """
        # use last level feature map
        src_proj = self.input_proj(src[-1])
        bs, c, h, w = src_proj.shape
        # flatten [B, C, H, W] to [B, HxW, C]
        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])
        if src_mask is not None:
            # Resize the padding mask to the feature-map resolution.
            src_mask = F.interpolate(
                src_mask.unsqueeze(0).astype(src_flatten.dtype),
                size=(h, w))[0].astype('bool')
        else:
            # No padding info: treat every position as valid.
            src_mask = paddle.ones([bs, h, w], dtype='bool')
        pos_embed = self.position_embedding(src_mask).flatten(2).transpose(
            [0, 2, 1])

        # Convert bool mask to additive float and broadcast over heads/queries.
        src_mask = _convert_attention_mask(src_mask, src_flatten.dtype)
        src_mask = src_mask.reshape([bs, 1, 1, -1])

        memory = self.encoder(
            src_flatten, src_mask=src_mask, pos_embed=pos_embed)

        # Queries start as zeros; the learned embedding enters via attention.
        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
            [bs, 1, 1])
        tgt = paddle.zeros_like(query_pos_embed)
        output = self.decoder(
            tgt,
            memory,
            memory_mask=src_mask,
            pos_embed=pos_embed,
            query_pos_embed=query_pos_embed)

        return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]),
                src_proj, src_mask.reshape([bs, 1, 1, h, w]))
ppdet/modeling/transformers/matchers.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
scipy.optimize
import
linear_sum_assignment
from
ppdet.core.workspace
import
register
,
serializable
from
..losses.iou_loss
import
GIoULoss
from
.utils
import
bbox_cxcywh_to_xyxy
__all__
=
[
'HungarianMatcher'
]
@register
@serializable
class HungarianMatcher(nn.Layer):
    """Bipartite matcher assigning each ground-truth box to exactly one
    prediction by minimizing a class + L1-bbox + GIoU cost via the
    Hungarian algorithm (scipy's linear_sum_assignment)."""
    __shared__ = ['use_focal_loss']

    def __init__(self,
                 matcher_coeff={'class': 1,
                                'bbox': 5,
                                'giou': 2},
                 use_focal_loss=False,
                 alpha=0.25,
                 gamma=2.0):
        r"""
        Args:
            matcher_coeff (dict): The coefficient of hungarian matcher cost.
            use_focal_loss (bool): Use the focal-style classification cost
                (with `alpha`/`gamma`) instead of -softmax probability.
        """
        super(HungarianMatcher, self).__init__()
        self.matcher_coeff = matcher_coeff
        self.use_focal_loss = use_focal_loss
        self.alpha = alpha
        self.gamma = gamma

        self.giou_loss = GIoULoss()

    def forward(self, boxes, logits, gt_bbox, gt_class):
        r"""
        Args:
            boxes (Tensor): [b, query, 4]
            logits (Tensor): [b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = boxes.shape[:2]

        num_gts = sum(len(a) for a in gt_class)
        if num_gts == 0:
            # No ground truth anywhere in the batch: empty match per image.
            return [(paddle.to_tensor(
                [], dtype=paddle.int64), paddle.to_tensor(
                    [], dtype=paddle.int64)) for _ in range(bs)]

        # We flatten to compute the cost matrices in a batch
        # [batch_size * num_queries, num_classes]
        out_prob = F.sigmoid(logits.flatten(
            0, 1)) if self.use_focal_loss else F.softmax(
                logits.flatten(0, 1))
        # [batch_size * num_queries, 4]
        out_bbox = boxes.flatten(0, 1)

        # Also concat the target labels and boxes
        tgt_ids = paddle.concat(gt_class).flatten()
        tgt_bbox = paddle.concat(gt_bbox)

        # Compute the classification cost
        if self.use_focal_loss:
            # Focal-style cost: pos minus neg term at the target class;
            # 1e-8 guards log(0).
            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
                1 - out_prob + 1e-8).log())
            pos_cost_class = self.alpha * ((1 - out_prob)**self.gamma) * (-(
                out_prob + 1e-8).log())
            cost_class = paddle.gather(
                pos_cost_class, tgt_ids, axis=1) - paddle.gather(
                    neg_cost_class, tgt_ids, axis=1)
        else:
            # 1 - prob would be the true cost; the constant doesn't change
            # the assignment, so -prob is used.
            cost_class = -paddle.gather(out_prob, tgt_ids, axis=1)

        # Compute the L1 cost between boxes
        cost_bbox = (
            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)

        # Compute the giou cost betwen boxes
        cost_giou = self.giou_loss(
            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)

        # Final cost matrix
        C = self.matcher_coeff['class'] * cost_class + self.matcher_coeff['bbox'] * cost_bbox + \
            self.matcher_coeff['giou'] * cost_giou
        # [bs, num_queries, total_num_gts]; then split per image.
        C = C.reshape([bs, num_queries, -1])
        C = [a.squeeze(0) for a in C.chunk(bs)]

        # Each image solves the assignment only against its own gt columns.
        sizes = [a.shape[0] for a in gt_bbox]
        indices = [
            linear_sum_assignment(c.split(sizes, -1)[i].numpy())
            for i, c in enumerate(C)
        ]
        return [(paddle.to_tensor(
            i, dtype=paddle.int64), paddle.to_tensor(
                j, dtype=paddle.int64)) for i, j in indices]
ppdet/modeling/transformers/position_encoding.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
paddle
import
paddle.nn
as
nn
from
ppdet.core.workspace
import
register
,
serializable
@register
@serializable
class PositionEmbedding(nn.Layer):
    """Positional encoding for DETR-style transformers.

    Supports two variants selected by `embed_type`:
      * 'sine'    — fixed sinusoidal encoding (as in "Attention Is All You
                    Need" / DETR), computed from the valid-pixel mask.
      * 'learned' — two nn.Embedding tables, one per spatial axis.

    Args:
        num_pos_feats (int): number of features per axis; the output channel
            count is 2 * num_pos_feats (y-features concatenated with x-features).
        temperature (int): frequency base of the sinusoidal encoding.
        normalize (bool): if True, scale cumulative positions into [0, scale].
        scale (float|None): normalization range; defaults to 2*pi. Only valid
            together with normalize=True.
        embed_type (str): 'sine' or 'learned'.
        num_embeddings (int): table size per axis for the 'learned' variant
            (i.e. max supported H and W of the feature map).
    """

    def __init__(self,
                 num_pos_feats=128,
                 temperature=10000,
                 normalize=True,
                 scale=None,
                 embed_type='sine',
                 num_embeddings=50):
        super(PositionEmbedding, self).__init__()
        assert embed_type in ['sine', 'learned']

        self.embed_type = embed_type
        if self.embed_type == 'sine':
            self.num_pos_feats = num_pos_feats
            self.temperature = temperature
            self.normalize = normalize
            if scale is not None and normalize is False:
                raise ValueError("normalize should be True if scale is passed")
            if scale is None:
                scale = 2 * math.pi
            self.scale = scale
        elif self.embed_type == 'learned':
            # One learnable table per spatial axis; rows index y, cols index x.
            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
        else:
            raise ValueError(f"not supported {self.embed_type}")

    def forward(self, mask):
        """
        Args:
            mask (Tensor): [B, H, W], bool; True marks valid (non-padded)
                pixels. The 'learned' variant only uses its shape.
        Returns:
            pos (Tensor): [B, C, H, W] with C = 2 * num_pos_feats.
        """
        assert mask.dtype == paddle.bool
        if self.embed_type == 'sine':
            mask = mask.astype('float32')
            # Cumulative sums over valid pixels give each pixel its (y, x)
            # position index within the un-padded region.
            y_embed = mask.cumsum(1, dtype='float32')
            x_embed = mask.cumsum(2, dtype='float32')
            if self.normalize:
                eps = 1e-6
                # Divide by the last (largest) cumulative value per row/col to
                # normalize positions into [0, scale].
                y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
                x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
            # dim_t[k] = temperature ** (2*(k//2) / num_pos_feats): paired
            # frequencies so each (sin, cos) pair shares a wavelength.
            dim_t = 2 * (paddle.arange(self.num_pos_feats) //
                         2).astype('float32')
            dim_t = self.temperature**(dim_t / self.num_pos_feats)

            pos_x = x_embed.unsqueeze(-1) / dim_t
            pos_y = y_embed.unsqueeze(-1) / dim_t
            # Interleave sin on even channels and cos on odd channels.
            pos_x = paddle.stack(
                (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
                axis=4).flatten(3)
            pos_y = paddle.stack(
                (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
                axis=4).flatten(3)
            pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2])
            return pos
        elif self.embed_type == 'learned':
            h, w = mask.shape[-2:]
            i = paddle.arange(w)
            j = paddle.arange(h)
            x_emb = self.col_embed(i)
            y_emb = self.row_embed(j)
            # BUGFIX: paddle.Tensor has no torch-style `repeat`; use `tile`
            # with a repeat_times list. `tile` also takes a single list, not
            # positional ints.
            pos = paddle.concat(
                [
                    x_emb.unsqueeze(0).tile([h, 1, 1]),
                    y_emb.unsqueeze(1).tile([1, w, 1]),
                ],
                axis=-1).transpose([2, 0, 1]).unsqueeze(0).tile(
                    [mask.shape[0], 1, 1, 1])
            return pos
        else:
            raise ValueError(f"not supported {self.embed_type}")
ppdet/modeling/transformers/utils.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
copy
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
..bbox_utils
import
bbox_overlaps
__all__
=
[
'_get_clones'
,
'bbox_overlaps'
,
'bbox_cxcywh_to_xyxy'
,
'bbox_xyxy_to_cxcywh'
,
'sigmoid_focal_loss'
]
def _get_clones(module, N):
    """Return a LayerList containing N independent deep copies of `module`."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.LayerList(copies)
def bbox_cxcywh_to_xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x0, y0, x1, y1) on the last axis."""
    cx, cy, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    corners = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
    return paddle.stack(corners, axis=-1)
def bbox_xyxy_to_cxcywh(x):
    """Convert boxes from (x0, y0, x1, y1) to (cx, cy, w, h) on the last axis."""
    left, top, right, bottom = x.unbind(-1)
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top
    return paddle.stack([center_x, center_y, width, height], axis=-1)
def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
    """Focal loss on sigmoid outputs (Lin et al., "Focal Loss for Dense
    Object Detection").

    Args:
        logit (Tensor): raw (pre-sigmoid) predictions.
        label (Tensor): binary targets, same shape as `logit`.
        normalizer (float): divisor applied to the reduced loss when > 1.
        alpha (float): class-balance weight; disabled when negative.
        gamma (float): focusing exponent on the modulating factor.
    Returns:
        Tensor: scalar loss, mean over the last kept axis then summed,
        optionally divided by `normalizer`.
    """
    prob = F.sigmoid(logit)
    ce_loss = F.binary_cross_entropy_with_logits(
        logit, label, reduction="none")
    # p_t: model's probability for the true class of each element.
    p_t = label * prob + (1 - label) * (1 - prob)
    # Down-weight easy examples via the (1 - p_t)^gamma modulating factor.
    loss = ce_loss * ((1 - p_t)**gamma)

    if alpha >= 0:
        alpha_t = alpha * label + (1 - alpha) * (1 - label)
        loss = alpha_t * loss

    reduced = loss.mean(1).sum()
    return reduced / normalizer if normalizer > 1. else reduced
ppdet/optimizer.py
浏览文件 @
3c1c576d
...
...
@@ -244,10 +244,11 @@ class OptimizerBuilder():
optim_args
=
self
.
optimizer
.
copy
()
optim_type
=
optim_args
[
'type'
]
del
optim_args
[
'type'
]
if
optim_type
!=
'AdamW'
:
optim_args
[
'weight_decay'
]
=
regularization
op
=
getattr
(
optimizer
,
optim_type
)
return
op
(
learning_rate
=
learning_rate
,
parameters
=
params
,
weight_decay
=
regularization
,
grad_clip
=
grad_clip
,
**
optim_args
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录