Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
s920243400
PaddleDetection
提交
3c1c576d
P
PaddleDetection
项目概览
s920243400
/
PaddleDetection
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleDetection
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3c1c576d
编写于
7月 07, 2021
作者:
S
shangliang Xu
提交者:
GitHub
7月 07, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Transformer] Add transformer base code (#3612)
* Add DETR * drop return_pad_mask in PadBatch
上级
43515234
变更
17
隐藏空白更改
内联
并排
Showing
17 changed files
with
1882 additions
and
12 deletions
+1882
-12
ppdet/data/transform/operators.py
ppdet/data/transform/operators.py
+353
-1
ppdet/modeling/__init__.py
ppdet/modeling/__init__.py
+2
-0
ppdet/modeling/architectures/__init__.py
ppdet/modeling/architectures/__init__.py
+2
-0
ppdet/modeling/architectures/detr.py
ppdet/modeling/architectures/detr.py
+93
-0
ppdet/modeling/heads/__init__.py
ppdet/modeling/heads/__init__.py
+2
-0
ppdet/modeling/heads/detr_head.py
ppdet/modeling/heads/detr_head.py
+278
-0
ppdet/modeling/initializer.py
ppdet/modeling/initializer.py
+18
-4
ppdet/modeling/layers.py
ppdet/modeling/layers.py
+179
-0
ppdet/modeling/losses/__init__.py
ppdet/modeling/losses/__init__.py
+2
-0
ppdet/modeling/losses/detr_loss.py
ppdet/modeling/losses/detr_loss.py
+230
-0
ppdet/modeling/post_process.py
ppdet/modeling/post_process.py
+65
-6
ppdet/modeling/transformers/__init__.py
ppdet/modeling/transformers/__init__.py
+23
-0
ppdet/modeling/transformers/detr_transformer.py
ppdet/modeling/transformers/detr_transformer.py
+351
-0
ppdet/modeling/transformers/matchers.py
ppdet/modeling/transformers/matchers.py
+123
-0
ppdet/modeling/transformers/position_encoding.py
ppdet/modeling/transformers/position_encoding.py
+101
-0
ppdet/modeling/transformers/utils.py
ppdet/modeling/transformers/utils.py
+58
-0
ppdet/optimizer.py
ppdet/optimizer.py
+2
-1
未找到文件。
ppdet/data/transform/operators.py
浏览文件 @
3c1c576d
...
...
@@ -40,6 +40,7 @@ from PIL import Image, ImageEnhance, ImageDraw
from
ppdet.core.workspace
import
serializable
from
ppdet.modeling.layers
import
AnchorGrid
from
ppdet.modeling
import
bbox_utils
from
..reader
import
Compose
from
.op_helper
import
(
satisfy_sample_constraint
,
filter_and_process
,
generate_sample_bbox
,
clip_bbox
,
data_anchor_sampling
,
...
...
@@ -2348,7 +2349,7 @@ class RandomResizeCrop(BaseOperator):
for
gt_segm
in
sample
[
'gt_segm'
]
]
sample
[
'gt_segm'
]
=
np
.
asarray
(
masks
).
astype
(
np
.
uint8
)
return
sample
...
...
@@ -2528,3 +2529,354 @@ class Mosaic(BaseOperator):
sample
[
'difficult'
]
=
difficult
return
sample
@register_op
class RandomSelect(BaseOperator):
    """Randomly apply one of two transform pipelines to a sample.

    With probability ``p`` the sample is passed through ``transforms1``,
    otherwise through ``transforms2``.

    Args:
        transforms1 (list): operator configs for the first pipeline.
        transforms2 (list): operator configs for the second pipeline.
        p (float): probability of choosing ``transforms1``. Default 0.5.
    """

    def __init__(self, transforms1, transforms2, p=0.5):
        super(RandomSelect, self).__init__()
        # Compose turns the config lists into callable pipelines.
        self.transforms1 = Compose(transforms1)
        self.transforms2 = Compose(transforms2)
        self.p = p

    def apply(self, sample, context=None):
        """Return the sample transformed by one randomly chosen pipeline."""
        chosen = self.transforms1 if random.random() < self.p else self.transforms2
        return chosen(sample)
@register_op
class RandomShortSideResize(BaseOperator):
    def __init__(self,
                 short_side_sizes,
                 max_size=None,
                 interp=cv2.INTER_LINEAR,
                 random_interp=False):
        """
        Resize the image randomly according to the short side. If max_size is not None,
        the long side is scaled according to max_size. The aspect ratio is always kept.

        Args:
            short_side_sizes (list|tuple): candidate target sizes for the short side.
            max_size (int): upper bound for the longest side after resize.
            interp (int): OpenCV interpolation method.
            random_interp (bool): whether to pick the interpolation method at random.
        """
        super(RandomShortSideResize, self).__init__()

        assert isinstance(short_side_sizes,
                          Sequence), "short_side_sizes must be List or Tuple"

        self.short_side_sizes = short_side_sizes
        self.max_size = max_size
        self.interp = interp
        self.random_interp = random_interp
        # Pool of interpolation methods used when random_interp is enabled.
        self.interps = [
            cv2.INTER_NEAREST,
            cv2.INTER_LINEAR,
            cv2.INTER_AREA,
            cv2.INTER_CUBIC,
            cv2.INTER_LANCZOS4,
        ]

    def get_size_with_aspect_ratio(self, image_shape, size, max_size=None):
        """Compute an aspect-ratio-preserving (w, h) whose short side is `size`,
        optionally capping the long side at `max_size`."""
        h, w = image_shape
        if max_size is not None:
            shorter = float(min((w, h)))
            longer = float(max((w, h)))
            # Shrink the requested short side so the long side fits max_size.
            if longer / shorter * size > max_size:
                size = int(round(max_size * shorter / longer))

        # Already at the requested short side: no change.
        if (w <= h and w == size) or (h <= w and h == size):
            return (w, h)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)
        return (ow, oh)

    def resize(self, sample, target_size, max_size=None, interp=cv2.INTER_LINEAR):
        """Resize image plus all spatial annotations in `sample` in place."""
        im = sample['image']
        if not isinstance(im, np.ndarray):
            raise TypeError("{}: image type is not numpy.".format(self))
        if len(im.shape) != 3:
            raise ImageError('{}: image is not 3-dimensional.'.format(self))

        target_size = self.get_size_with_aspect_ratio(im.shape[:2], target_size,
                                                      max_size)
        # target_size is (w, h); im.shape is (h, w, c).
        im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[
            0] / im.shape[1]

        sample['image'] = cv2.resize(im, target_size, interpolation=interp)
        # im_shape is stored as (h, w), hence the reversal.
        sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32)
        if 'scale_factor' in sample:
            prev = sample['scale_factor']
            sample['scale_factor'] = np.asarray(
                [prev[0] * im_scale_y, prev[1] * im_scale_x], dtype=np.float32)
        else:
            sample['scale_factor'] = np.asarray(
                [im_scale_y, im_scale_x], dtype=np.float32)

        # apply bbox
        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
            sample['gt_bbox'] = self.apply_bbox(
                sample['gt_bbox'], [im_scale_x, im_scale_y], target_size)
        # apply polygon
        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im.shape[:2],
                                                [im_scale_x, im_scale_y])
        # apply semantic segmentation map
        # NOTE(review): truthiness of sample['semantic'] assumes it is None or a
        # scalar-like value when absent; verify it is never a non-empty ndarray here.
        if 'semantic' in sample and sample['semantic']:
            semantic = sample['semantic']
            semantic = cv2.resize(
                semantic.astype('float32'),
                target_size,
                interpolation=self.interp)
            semantic = np.asarray(semantic).astype('int32')
            semantic = np.expand_dims(semantic, 0)
            sample['semantic'] = semantic
        # apply per-instance masks; nearest keeps them binary
        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
            masks = [
                cv2.resize(
                    gt_segm, target_size, interpolation=cv2.INTER_NEAREST)
                for gt_segm in sample['gt_segm']
            ]
            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)

        return sample

    def apply_bbox(self, bbox, scale, size):
        """Scale boxes by (im_scale_x, im_scale_y) and clip to the new size."""
        im_scale_x, im_scale_y = scale
        resize_w, resize_h = size
        bbox[:, 0::2] *= im_scale_x
        bbox[:, 1::2] *= im_scale_y
        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
        return bbox.astype('float32')

    def apply_segm(self, segms, im_size, scale):
        """Rescale polygon- or RLE-encoded segmentations."""

        def _resize_poly(poly, im_scale_x, im_scale_y):
            resized_poly = np.array(poly).astype('float32')
            resized_poly[0::2] *= im_scale_x
            resized_poly[1::2] *= im_scale_y
            return resized_poly.tolist()

        def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):
            # Uncompressed RLE must be converted before decoding.
            if 'counts' in rle and type(rle['counts']) == list:
                rle = mask_util.frPyObjects(rle, im_h, im_w)

            mask = mask_util.decode(rle)
            mask = cv2.resize(
                mask,
                None,
                None,
                fx=im_scale_x,
                fy=im_scale_y,
                interpolation=self.interp)
            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
            return rle

        im_h, im_w = im_size
        im_scale_x, im_scale_y = scale
        resized_segms = []
        for segm in segms:
            if is_poly(segm):
                # Polygon format
                resized_segms.append([
                    _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm
                ])
            else:
                # RLE format; pycocotools is imported lazily so it stays optional.
                import pycocotools.mask as mask_util
                resized_segms.append(
                    _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))

        return resized_segms

    def apply(self, sample, context=None):
        """Pick a random short-side size (and optionally interpolation) and resize."""
        target_size = random.choice(self.short_side_sizes)
        interp = random.choice(
            self.interps) if self.random_interp else self.interp
        return self.resize(sample, target_size, self.max_size, interp)
@register_op
class RandomSizeCrop(BaseOperator):
    """
    Crop the image at a random location with a random size drawn from
    [`min_size`, `max_size`], clipped to the image extent. Boxes, classes,
    scores, crowd flags, polygons and instance masks are cropped consistently,
    and boxes whose cropped area collapses to zero are dropped.
    """

    def __init__(self, min_size, max_size):
        super(RandomSizeCrop, self).__init__()
        self.min_size = min_size
        self.max_size = max_size

        # Imported lazily so importing this module does not require paddle.vision.
        from paddle.vision.transforms.functional import crop as paddle_crop
        self.paddle_crop = paddle_crop

    @staticmethod
    def get_crop_params(img_shape, output_size):
        """Get parameters for ``crop`` for a random crop.

        Args:
            img_shape (list|tuple): Image's height and width.
            output_size (list|tuple): Expected output size of the crop.

        Returns:
            tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
        """
        h, w = img_shape
        th, tw = output_size

        if h + 1 < th or w + 1 < tw:
            raise ValueError(
                "Required crop size {} is larger then input image size {}".
                format((th, tw), (h, w)))

        if w == tw and h == th:
            return 0, 0, h, w

        i = random.randint(0, h - th + 1)
        j = random.randint(0, w - tw + 1)
        return i, j, th, tw

    def crop(self, sample, region):
        """Crop the image and every spatial annotation to `region` (i, j, h, w)."""
        image_shape = sample['image'].shape[:2]
        sample['image'] = self.paddle_crop(sample['image'], *region)

        keep_index = None
        # apply bbox
        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], region)
            bbox = sample['gt_bbox'].reshape([-1, 2, 2])
            # Area after cropping; degenerate boxes are discarded below.
            area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1)
            keep_index = np.where(area > 0)[0]
            sample['gt_bbox'] = sample['gt_bbox'][keep_index] if len(
                keep_index) > 0 else np.zeros(
                    [0, 4], dtype=np.float32)
            sample['gt_class'] = sample['gt_class'][keep_index] if len(
                keep_index) > 0 else np.zeros(
                    [0, 1], dtype=np.float32)
            if 'gt_score' in sample:
                sample['gt_score'] = sample['gt_score'][keep_index] if len(
                    keep_index) > 0 else np.zeros(
                        [0, 1], dtype=np.float32)
            if 'is_crowd' in sample:
                sample['is_crowd'] = sample['is_crowd'][keep_index] if len(
                    keep_index) > 0 else np.zeros(
                        [0, 1], dtype=np.float32)

        # apply polygon
        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region,
                                                image_shape)
            if keep_index is not None:
                sample['gt_poly'] = sample['gt_poly'][keep_index]

        # apply gt_segm
        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
            i, j, h, w = region
            sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w]
            if keep_index is not None:
                sample['gt_segm'] = sample['gt_segm'][keep_index]

        return sample

    def apply_bbox(self, bbox, region):
        """Translate boxes into crop coordinates and clip to the crop window."""
        i, j, h, w = region
        region_size = np.asarray([w, h])
        crop_bbox = bbox - np.asarray([j, i, j, i])
        crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size)
        crop_bbox = crop_bbox.clip(min=0)
        return crop_bbox.reshape([-1, 4]).astype('float32')

    def apply_segm(self, segms, region, image_shape):
        """Crop polygon- or RLE-encoded segmentations to `region`."""

        def _crop_poly(segm, crop):
            xmin, ymin, xmax, ymax = crop
            crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
            crop_p = np.array(crop_coord).reshape(4, 2)
            crop_p = Polygon(crop_p)

            crop_segm = list()
            for poly in segm:
                poly = np.array(poly).reshape(len(poly) // 2, 2)
                polygon = Polygon(poly)
                if not polygon.is_valid:
                    # Repair self-intersecting polygons by re-polygonizing
                    # the exterior's self-intersection.
                    exterior = polygon.exterior
                    multi_lines = exterior.intersection(exterior)
                    polygons = shapely.ops.polygonize(multi_lines)
                    polygon = MultiPolygon(polygons)

                multi_polygon = list()
                if isinstance(polygon, MultiPolygon):
                    multi_polygon = copy.deepcopy(polygon)
                else:
                    multi_polygon.append(copy.deepcopy(polygon))

                for per_polygon in multi_polygon:
                    inter = per_polygon.intersection(crop_p)
                    if not inter:
                        continue
                    if isinstance(inter, (MultiPolygon, GeometryCollection)):
                        for part in inter:
                            if not isinstance(part, Polygon):
                                continue
                            part = np.squeeze(
                                np.array(part.exterior.coords[:-1]).reshape(
                                    1, -1))
                            part[0::2] -= xmin
                            part[1::2] -= ymin
                            crop_segm.append(part.tolist())
                    elif isinstance(inter, Polygon):
                        crop_poly = np.squeeze(
                            np.array(inter.exterior.coords[:-1]).reshape(1,
                                                                         -1))
                        crop_poly[0::2] -= xmin
                        crop_poly[1::2] -= ymin
                        crop_segm.append(crop_poly.tolist())
                    else:
                        continue
            return crop_segm

        def _crop_rle(rle, crop, height, width):
            if 'counts' in rle and type(rle['counts']) == list:
                rle = mask_util.frPyObjects(rle, height, width)
            mask = mask_util.decode(rle)
            mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
            rle = mask_util.encode(
                np.array(
                    mask, order='F', dtype=np.uint8))
            return rle

        i, j, h, w = region
        crop = [j, i, j + w, i + h]
        height, width = image_shape
        crop_segms = []
        for segm in segms:
            if is_poly(segm):
                # Lazy imports: the nested helpers resolve these names through
                # the enclosing scope, so the imports must happen before the call.
                import copy
                import shapely.ops
                from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
                # Polygon format
                crop_segms.append(_crop_poly(segm, crop))
            else:
                # RLE format
                import pycocotools.mask as mask_util
                crop_segms.append(_crop_rle(segm, crop, height, width))
        return crop_segms

    def apply(self, sample, context=None):
        """Draw a random crop size, pick a random location, and crop."""
        h = random.randint(self.min_size,
                           min(sample['image'].shape[0], self.max_size))
        w = random.randint(self.min_size,
                           min(sample['image'].shape[1], self.max_size))
        region = self.get_crop_params(sample['image'].shape[:2], [h, w])
        return self.crop(sample, region)
ppdet/modeling/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -27,6 +27,7 @@ from . import post_process
from
.
import
layers
from
.
import
reid
from
.
import
mot
from
.
import
transformers
from
.ops
import
*
from
.backbones
import
*
...
...
@@ -39,3 +40,4 @@ from .post_process import *
from
.layers
import
*
from
.reid
import
*
from
.mot
import
*
from
.transformers
import
*
ppdet/modeling/architectures/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -21,6 +21,7 @@ from . import jde
from
.
import
deepsort
from
.
import
fairmot
from
.
import
centernet
from
.
import
detr
from
.meta_arch
import
*
from
.faster_rcnn
import
*
...
...
@@ -39,3 +40,4 @@ from .deepsort import *
from
.fairmot
import
*
from
.centernet
import
*
from
.blazeface
import
*
from
.detr
import
*
ppdet/modeling/architectures/detr.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
from
.meta_arch
import
BaseArch
from
ppdet.core.workspace
import
register
,
create
__all__
=
[
'DETR'
]
@register
class DETR(BaseArch):
    """DETR detection architecture: backbone -> transformer -> DETR head,
    with a bbox post-process step at inference time."""
    __category__ = 'architecture'
    __inject__ = ['post_process']

    def __init__(self,
                 backbone,
                 transformer,
                 detr_head,
                 post_process='DETRBBoxPostProcess'):
        super(DETR, self).__init__()
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.post_process = post_process

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Build sub-modules from config, threading shape/dim info between them."""
        # backbone
        backbone = create(cfg['backbone'])
        # transformer needs the backbone's output shapes
        kwargs = {'input_shape': backbone.out_shape}
        transformer = create(cfg['transformer'], **kwargs)
        # head needs the transformer's dims plus the backbone shapes
        kwargs = {
            'hidden_dim': transformer.hidden_dim,
            'nhead': transformer.nhead,
            'input_shape': backbone.out_shape
        }
        detr_head = create(cfg['detr_head'], **kwargs)

        return {
            'backbone': backbone,
            'transformer': transformer,
            "detr_head": detr_head,
        }

    def _forward(self):
        """Shared forward pass; returns a loss dict in training mode,
        (bbox, bbox_num) in eval mode."""
        # Backbone
        body_feats = self.backbone(self.inputs)

        # Transformer
        out_transformer = self.transformer(body_feats, self.inputs['pad_mask'])

        # DETR Head
        if self.training:
            return self.detr_head(out_transformer, body_feats, self.inputs)

        preds = self.detr_head(out_transformer, body_feats)
        bbox, bbox_num = self.post_process(preds, self.inputs['im_shape'],
                                           self.inputs['scale_factor'])
        return bbox, bbox_num

    def get_loss(self, ):
        losses = self._forward()
        # Sum every loss term except logging-only entries into a single scalar.
        total = paddle.add_n([v for k, v in losses.items() if 'log' not in k])
        losses.update({'loss': total})
        return losses

    def get_pred(self):
        bbox_pred, bbox_num = self._forward()
        return {
            "bbox": bbox_pred,
            "bbox_num": bbox_num,
        }
ppdet/modeling/heads/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -25,6 +25,7 @@ from . import face_head
from
.
import
s2anet_head
from
.
import
keypoint_hrhrnet_head
from
.
import
centernet_head
from
.
import
detr_head
from
.bbox_head
import
*
from
.mask_head
import
*
...
...
@@ -39,3 +40,4 @@ from .face_head import *
from
.s2anet_head
import
*
from
.keypoint_hrhrnet_head
import
*
from
.centernet_head
import
*
from
.detr_head
import
*
ppdet/modeling/heads/detr_head.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
ppdet.core.workspace
import
register
import
pycocotools.mask
as
mask_util
from
..initializer
import
*
__all__
=
[
'DETRHead'
]
class MLP(nn.Layer):
    """Simple multi-layer perceptron (also called FFN): `num_layers` linear
    layers with ReLU between them and no activation after the last one."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer widths: input_dim -> hidden_dim * (num_layers-1) -> output_dim.
        hidden = [hidden_dim] * (num_layers - 1)
        in_dims = [input_dim] + hidden
        out_dims = hidden + [output_dim]
        self.layers = nn.LayerList(
            nn.Linear(n, k) for n, k in zip(in_dims, out_dims))

        self._reset_parameters()

    def _reset_parameters(self):
        for layer in self.layers:
            linear_init_(layer)

    def forward(self, x):
        last = self.num_layers - 1
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i < last:  # no activation on the output layer
                x = F.relu(x)
        return x
class MultiHeadAttentionMap(nn.Layer):
    """This is a 2D attention module, which only returns the attention softmax
    (no multiplication by value)."""

    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0,
                 bias=True):
        super().__init__()
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout)

        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.XavierUniform())
        bias_attr = paddle.framework.ParamAttr(
            initializer=paddle.nn.initializer.Constant()) if bias else False

        self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)
        # 1x1 conv projects the spatial key feature map to hidden_dim channels.
        self.k_proj = nn.Conv2D(
            query_dim,
            hidden_dim,
            1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)
        # Standard scaled dot-product factor 1/sqrt(head_dim).
        self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5

    def forward(self, q, k, mask=None):
        q = self.q_proj(q)
        k = self.k_proj(k)

        bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads, \
            self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]
        qh = q.reshape([bs, num_queries, n, c])
        kh = k.reshape([bs, n, c, h, w])
        # Equivalent to: paddle.einsum("bqnc,bnchw->bqnhw", qh * fact, kh),
        # expressed with bmm since einsum is avoided here.
        qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c])
        kh = kh.reshape([-1, c, h * w])
        weights = paddle.bmm(qh * self.normalize_fact, kh).reshape(
            [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4])

        if mask is not None:
            weights += mask
        # fix a potential bug: https://github.com/facebookresearch/detr/issues/247
        # (softmax over the flattened spatial dims, then restore the shape)
        weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape)
        weights = self.dropout(weights)
        return weights
class MaskHeadFPNConv(nn.Layer):
    """
    Simple convolutional head, using group norm.
    Upsampling is done using a FPN approach.
    """

    def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):
        super().__init__()
        # Channel schedule: input_dim, then context_dim/2, /4, /8, /16.
        inter_dims = [input_dim, ] + [
            context_dim // (2**i) for i in range(1, 5)
        ]
        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.KaimingUniform())
        bias_attr = paddle.framework.ParamAttr(
            initializer=paddle.nn.initializer.Constant())

        self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,
                                       weight_attr, bias_attr)
        self.conv_inter = nn.LayerList()
        for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]):
            self.conv_inter.append(
                self._make_layers(in_dims, out_dims, 3, num_groups,
                                  weight_attr, bias_attr))

        self.conv_out = nn.Conv2D(
            inter_dims[-1],
            1,
            3,
            padding=1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)

        # 1x1 adapters bring each FPN level to the matching inter_dims width.
        self.adapter = nn.LayerList()
        for i in range(len(fpn_dims)):
            self.adapter.append(
                nn.Conv2D(
                    fpn_dims[i],
                    inter_dims[i + 1],
                    1,
                    weight_attr=weight_attr,
                    bias_attr=bias_attr))

    def _make_layers(self,
                     in_dims,
                     out_dims,
                     kernel_size,
                     num_groups,
                     weight_attr=None,
                     bias_attr=None):
        # conv -> group norm -> ReLU building block.
        return nn.Sequential(
            nn.Conv2D(
                in_dims,
                out_dims,
                kernel_size,
                padding=kernel_size // 2,
                weight_attr=weight_attr,
                bias_attr=bias_attr),
            nn.GroupNorm(num_groups, out_dims),
            nn.ReLU())

    def forward(self, x, bbox_attention_map, fpns):
        # Tile the projected source once per query and concatenate the
        # per-query attention maps along the channel axis.
        num_queries = bbox_attention_map.shape[1]
        x = paddle.concat([
            x.tile([num_queries, 1, 1, 1]), bbox_attention_map.flatten(0, 1)
        ], 1)
        x = self.conv0(x)
        # FPN-style refinement: upsample, add the adapted lateral feature.
        for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1],
                                                    self.adapter, fpns):
            feat = adapter_layer(feat).tile([num_queries, 1, 1, 1])
            x = inter_layer(x)
            x = feat + F.interpolate(x, size=feat.shape[-2:])

        x = self.conv_inter[-1](x)
        x = self.conv_out(x)
        return x
@register
class DETRHead(nn.Layer):
    """DETR prediction head: classification scores, box regression, and an
    optional attention-based mask head for instance segmentation."""
    __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']
    __inject__ = ['loss']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 nhead=8,
                 num_mlp_layers=3,
                 loss='DETRLoss',
                 fpn_dims=[1024, 512, 256],
                 with_mask_head=False,
                 use_focal_loss=False):
        super(DETRHead, self).__init__()
        # add background class when not using focal loss
        # NOTE(review): fpn_dims list default is never mutated here, so the
        # mutable default is harmless; kept for config-compatibility.
        self.num_classes = num_classes if use_focal_loss else num_classes + 1
        self.hidden_dim = hidden_dim
        self.loss = loss
        self.with_mask_head = with_mask_head
        self.use_focal_loss = use_focal_loss

        self.score_head = nn.Linear(hidden_dim, self.num_classes)
        self.bbox_head = MLP(hidden_dim,
                             hidden_dim,
                             output_dim=4,
                             num_layers=num_mlp_layers)
        if self.with_mask_head:
            self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim,
                                                        nhead)
            self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims,
                                             hidden_dim)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.score_head)

    @classmethod
    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
        # fpn_dims: backbone channels from deep to shallow, skipping the deepest.
        return {
            'hidden_dim': hidden_dim,
            'nhead': nhead,
            'fpn_dims': [i.channels for i in input_shape[::-1]][1:]
        }

    @staticmethod
    def get_gt_mask_from_polygons(gt_poly, pad_mask):
        """Rasterize GT polygons into per-instance masks, padded to the
        batch's padded image size (derived from pad_mask row/col sums)."""
        out_gt_mask = []
        for polygons, padding in zip(gt_poly, pad_mask):
            # Valid (unpadded) image extent inside the padded canvas.
            height, width = int(padding[:, 0].sum()), int(padding[0, :].sum())
            masks = []
            for obj_poly in polygons:
                rles = mask_util.frPyObjects(obj_poly, height, width)
                rle = mask_util.merge(rles)
                masks.append(
                    paddle.to_tensor(mask_util.decode(rle)).astype('float32'))
            masks = paddle.stack(masks)
            masks_pad = paddle.zeros(
                [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]])
            masks_pad[:, :height, :width] = masks
            out_gt_mask.append(masks_pad)
        return out_gt_mask

    def forward(self, out_transformer, body_feats, inputs=None):
        r"""
        Args:
            out_transformer (Tuple): (feats: [num_levels, batch_size,
                                        num_queries, hidden_dim],
                            memory: [batch_size, hidden_dim, h, w],
                            src_proj: [batch_size, h*w, hidden_dim],
                            src_mask: [batch_size, 1, 1, h, w])
            body_feats (List(Tensor)): list[[B, C, H, W]]
            inputs (dict): dict(inputs)
        """
        feats, memory, src_proj, src_mask = out_transformer
        outputs_logit = self.score_head(feats)
        outputs_bbox = F.sigmoid(self.bbox_head(feats))
        outputs_seg = None

        if self.with_mask_head:
            # Attention maps from the last decoder level only.
            bbox_attention_map = self.bbox_attention(feats[-1], memory,
                                                     src_mask)
            fpn_feats = [a for a in body_feats[::-1]][1:]
            outputs_seg = self.mask_head(src_proj, bbox_attention_map,
                                         fpn_feats)
            outputs_seg = outputs_seg.reshape([
                feats.shape[1], feats.shape[2], outputs_seg.shape[-2],
                outputs_seg.shape[-1]
            ])

        if self.training:
            assert inputs is not None
            assert 'gt_bbox' in inputs and 'gt_class' in inputs

            gt_mask = self.get_gt_mask_from_polygons(
                inputs['gt_poly'],
                inputs['pad_mask']) if 'gt_poly' in inputs else None
            return self.loss(
                outputs_bbox,
                outputs_logit,
                inputs['gt_bbox'],
                inputs['gt_class'],
                masks=outputs_seg,
                gt_mask=gt_mask)
        else:
            # Inference uses only the last decoder level's predictions.
            return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)
ppdet/modeling/initializer.py
浏览文件 @
3c1c576d
...
...
@@ -28,6 +28,8 @@ __all__ = [
'xavier_normal_'
,
'kaiming_uniform_'
,
'kaiming_normal_'
,
'linear_init_'
,
'conv_init_'
,
'reset_initialized_parameter'
,
]
...
...
@@ -46,7 +48,7 @@ def _no_grad_normal_(tensor, mean=0., std=1.):
return
tensor
def
_no_grad_fill_
(
tensor
,
value
=
0
):
def
_no_grad_fill_
(
tensor
,
value
=
0
.
):
with
paddle
.
no_grad
():
v
=
paddle
.
rand
(
shape
=
tensor
.
shape
,
dtype
=
tensor
.
dtype
)
v
[...]
=
value
...
...
@@ -80,7 +82,7 @@ def normal_(tensor, mean=0., std=1.):
return
_no_grad_normal_
(
tensor
,
mean
,
std
)
def
constant_
(
tensor
,
value
=
0
):
def
constant_
(
tensor
,
value
=
0
.
):
"""
Modified tensor inspace using constant_
Args:
...
...
@@ -150,7 +152,7 @@ def xavier_uniform_(tensor, gain=1., reverse=False):
Modified tensor inspace using xavier_uniform_
Args:
tensor (paddle.Tensor): paddle Tensor
gain (
str
): super parameter, 1. default.
gain (
float
): super parameter, 1. default.
reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
Return:
tensor
...
...
@@ -166,7 +168,7 @@ def xavier_normal_(tensor, gain=1., reverse=False):
Modified tensor inspace using xavier_normal_
Args:
tensor (paddle.Tensor): paddle Tensor
gain (
str
): super parameter, 1. default.
gain (
float
): super parameter, 1. default.
reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
Return:
tensor
...
...
@@ -260,6 +262,18 @@ def kaiming_normal_(tensor,
return
_no_grad_normal_
(
tensor
,
0
,
std
)
def linear_init_(module):
    """Initialize a Linear layer's weight and bias in place with
    U(-bound, bound), where bound = 1/sqrt(fan_in) (weight.shape[0])."""
    bound = 1 / math.sqrt(module.weight.shape[0])
    for param in (module.weight, module.bias):
        uniform_(param, -bound, bound)
def conv_init_(module):
    """Initialize a Conv layer's weight and bias in place with
    U(-bound, bound), where bound = 1/sqrt(prod of non-output weight dims)."""
    fan_in = math.prod(module.weight.shape[1:])
    bound = 1 / math.sqrt(fan_in)
    for param in (module.weight, module.bias):
        uniform_(param, -bound, bound)
@
paddle
.
no_grad
()
def
reset_initialized_parameter
(
model
,
include_self
=
True
):
"""
...
...
ppdet/modeling/layers.py
浏览文件 @
3c1c576d
...
...
@@ -29,8 +29,11 @@ from paddle.regularizer import L2Decay
from
ppdet.core.workspace
import
register
,
serializable
from
ppdet.modeling.bbox_utils
import
delta2bbox
from
.
import
ops
from
.initializer
import
xavier_uniform_
,
constant_
from
paddle.vision.ops
import
DeformConv2D
from
paddle.nn.layer
import
transformer
_convert_attention_mask
=
transformer
.
_convert_attention_mask
def
_to_list
(
l
):
...
...
@@ -1187,3 +1190,179 @@ class Concat(nn.Layer):
def
extra_repr
(
self
):
return
'dim={}'
.
format
(
self
.
dim
)
class MultiHeadAttention(nn.Layer):
    """
    Attention maps queries and a set of key-value pairs to outputs, and
    Multi-Head Attention performs multiple parallel attention to jointly attend
    to information from different representation subspaces.

    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
    for more details.

    Parameters:
        embed_dim (int): The expected feature size in the input and output.
        num_heads (int): The number of heads in multi-head attention.
        dropout (float, optional): The dropout probability used on attention
            weights to drop some attention targets. 0 for no dropout. Default 0
        kdim (int, optional): The feature size in key. If None, assumed equal to
            `embed_dim`. Default None.
        vdim (int, optional): The feature size in value. If None, assumed equal to
            `embed_dim`. Default None.
        need_weights (bool, optional): Indicate whether to return the attention
            weights. Default False.

    Examples:

        .. code-block:: python

            import paddle

            # encoder input: [batch_size, sequence_length, d_model]
            query = paddle.rand((2, 4, 128))
            # self attention mask: [batch_size, num_heads, query_len, query_len]
            attn_mask = paddle.rand((2, 2, 4, 4))
            multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)
            output = multi_head_attn(query, None, None, attn_mask=attn_mask)  # [2, 4, 128]
    """

    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 kdim=None,
                 vdim=None,
                 need_weights=False):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        # Key/value feature sizes default to embed_dim when not given.
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        # When q/k/v share embed_dim, a single fused [embed_dim, 3*embed_dim]
        # projection is used instead of three separate Linear layers.
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.need_weights = need_weights

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        if self._qkv_same_embed_dim:
            # Fused QKV projection: weight [embed_dim, 3*embed_dim],
            # bias [3*embed_dim]; sliced per q/k/v in compute_qkv.
            self.in_proj_weight = self.create_parameter(
                shape=[embed_dim, 3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=False)
            self.in_proj_bias = self.create_parameter(
                shape=[3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=True)
        else:
            self.q_proj = nn.Linear(embed_dim, embed_dim)
            self.k_proj = nn.Linear(self.kdim, embed_dim)
            self.v_proj = nn.Linear(self.vdim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)
        # Attribute names looked up by index in compute_qkv (unfused path).
        self._type_list = ('q_proj', 'k_proj', 'v_proj')

        self._reset_parameters()

    def _reset_parameters(self):
        # Xavier-uniform for matrices, constant for vectors (biases).
        # NOTE(review): constant_ presumably fills with 0 — confirm in
        # ppdet/modeling/initializer.py.
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
            else:
                constant_(p)

    def compute_qkv(self, tensor, index):
        # index: 0 -> q, 1 -> k, 2 -> v.
        if self._qkv_same_embed_dim:
            # Slice the fused projection for the requested stream.
            tensor = F.linear(
                x=tensor,
                weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1)
                                           * self.embed_dim],
                bias=self.in_proj_bias[index * self.embed_dim:(index + 1) *
                                       self.embed_dim]
                if self.in_proj_bias is not None else None)
        else:
            tensor = getattr(self, self._type_list[index])(tensor)
        # [bs, seq, embed_dim] -> [bs, num_heads, seq, head_dim]
        # (0 in reshape keeps the corresponding input dimension).
        tensor = tensor.reshape(
            [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
        return tensor

    def forward(self, query, key=None, value=None, attn_mask=None):
        r"""
        Applies multi-head attention to map queries and a set of key-value pairs
        to outputs.

        Parameters:
            query (Tensor): The queries for multi-head attention. It is a
                tensor with shape `[batch_size, query_length, embed_dim]`. The
                data type should be float32 or float64.
            key (Tensor, optional): The keys for multi-head attention. It is
                a tensor with shape `[batch_size, key_length, kdim]`. The
                data type should be float32 or float64. If None, use `query` as
                `key`. Default None.
            value (Tensor, optional): The values for multi-head attention. It
                is a tensor with shape `[batch_size, value_length, vdim]`.
                The data type should be float32 or float64. If None, use `query` as
                `value`. Default None.
            attn_mask (Tensor, optional): A tensor used in multi-head attention
                to prevents attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
                When the data type is bool, the unwanted positions have `False`
                values and the others have `True` values. When the data type is
                int, the unwanted positions have 0 values and the others have 1
                values. When the data type is float, the unwanted positions have
                `-INF` values and the others have 0 values. It can be None when
                nothing wanted or needed to be prevented attention to. Default None.

        Returns:
            Tensor|tuple: A tensor with the same shape and data type as `query`
                representing the attention output; or, if `need_weights` is
                True, a tuple of that tensor plus the attention weights tensor
                shaped `[batch_size, num_heads, query_length, key_length]`.
        """
        # Self-attention when key/value are omitted.
        key = query if key is None else key
        value = query if value is None else value
        # compute q ,k ,v
        q, k, v = (self.compute_qkv(t, i)
                   for i, t in enumerate([query, key, value]))

        # scale dot product attention
        product = paddle.matmul(x=q, y=k, transpose_y=True)
        scaling = float(self.head_dim)**-0.5
        product = product * scaling

        if attn_mask is not None:
            # Support bool or int mask
            attn_mask = _convert_attention_mask(attn_mask, product.dtype)
            # Additive mask: unwanted positions carry -INF after conversion.
            product = product + attn_mask
        weights = F.softmax(product)
        if self.dropout:
            weights = F.dropout(
                weights,
                self.dropout,
                training=self.training,
                mode="upscale_in_train")
        out = paddle.matmul(weights, v)

        # combine heads: [bs, heads, seq, head_dim] -> [bs, seq, embed_dim]
        out = paddle.transpose(out, perm=[0, 2, 1, 3])
        out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)

        outs = [out]
        if self.need_weights:
            outs.append(weights)
        return out if len(outs) == 1 else tuple(outs)
ppdet/modeling/losses/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -22,6 +22,7 @@ from . import ctfocal_loss
from
.
import
keypoint_loss
from
.
import
jde_loss
from
.
import
fairmot_loss
from
.
import
detr_loss
from
.yolo_loss
import
*
from
.iou_aware_loss
import
*
...
...
@@ -33,3 +34,4 @@ from .ctfocal_loss import *
from
.keypoint_loss
import
*
from
.jde_loss
import
*
from
.fairmot_loss
import
*
from
.detr_loss
import
*
ppdet/modeling/losses/detr_loss.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
ppdet.core.workspace
import
register
from
.iou_loss
import
GIoULoss
from
..transformers
import
bbox_cxcywh_to_xyxy
,
bbox_overlaps
,
sigmoid_focal_loss
__all__
=
[
'DETRLoss'
]
@register
class DETRLoss(nn.Layer):
    """DETR set-prediction loss: Hungarian matching followed by class,
    bbox L1, GIoU and (optionally) mask/dice losses, with optional
    auxiliary losses on intermediate decoder layers."""

    __shared__ = ['num_classes', 'use_focal_loss']
    __inject__ = ['matcher']

    def __init__(self,
                 num_classes=80,
                 matcher='HungarianMatcher',
                 loss_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'no_object': 0.1,
                     'mask': 1,
                     'dice': 1
                 },
                 aux_loss=True,
                 use_focal_loss=False):
        r"""
        Args:
            num_classes (int): The number of classes.
            matcher (HungarianMatcher): It computes an assignment between the targets
                and the predictions of the network.
            loss_coeff (dict): The coefficient of loss.
            aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.
            use_focal_loss (bool): Use focal loss or not.
        """
        super(DETRLoss, self).__init__()
        self.num_classes = num_classes
        self.matcher = matcher
        # FIX: copy the dict so that the mutation below cannot corrupt the
        # shared mutable default argument across DETRLoss instances.
        self.loss_coeff = dict(loss_coeff)
        self.aux_loss = aux_loss
        self.use_focal_loss = use_focal_loss

        if not self.use_focal_loss:
            # For cross entropy, expand the class coefficient into a per-class
            # weight vector with a down-weighted background ("no object") slot.
            self.loss_coeff['class'] = paddle.full([num_classes + 1],
                                                   loss_coeff['class'])
            self.loss_coeff['class'][-1] = loss_coeff['no_object']
        self.giou_loss = GIoULoss()

    def _get_loss_class(self, logits, gt_class, match_indices, bg_index,
                        num_gts):
        # logits: [b, query, num_classes], gt_class: list[[n, 1]]
        # Every query defaults to background; matched queries get GT labels.
        target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
        bs, num_query_objects = target_label.shape
        if sum(len(a) for a in gt_class) > 0:
            index, updates = self._get_index_updates(num_query_objects,
                                                     gt_class, match_indices)
            target_label = paddle.scatter(
                target_label.reshape([-1, 1]), index, updates.astype('int64'))
            target_label = target_label.reshape([bs, num_query_objects])
        if self.use_focal_loss:
            # One-hot without the background column (focal loss is per-class
            # sigmoid, so background is the all-zeros row).
            target_label = F.one_hot(target_label,
                                     self.num_classes + 1)[:, :, :-1]
        return {
            'loss_class': self.loss_coeff['class'] * sigmoid_focal_loss(
                logits, target_label, num_gts / num_query_objects)
            if self.use_focal_loss else F.cross_entropy(
                logits, target_label, weight=self.loss_coeff['class'])
        }

    def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts):
        # boxes: [b, query, 4], gt_bbox: list[[n, 4]]
        loss = dict()
        if sum(len(a) for a in gt_bbox) == 0:
            # No targets in the whole batch: zero losses keep keys present.
            loss['loss_bbox'] = paddle.to_tensor([0.])
            loss['loss_giou'] = paddle.to_tensor([0.])
            return loss

        src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
                                                            match_indices)
        # L1 on cxcywh boxes, normalized by the (all-reduced) GT count.
        loss['loss_bbox'] = self.loss_coeff['bbox'] * F.l1_loss(
            src_bbox, target_bbox, reduction='sum') / num_gts
        loss['loss_giou'] = self.giou_loss(
            bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
        loss['loss_giou'] = loss['loss_giou'].sum() / num_gts
        loss['loss_giou'] = self.loss_coeff['giou'] * loss['loss_giou']
        return loss

    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts):
        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
        loss = dict()
        if sum(len(a) for a in gt_mask) == 0:
            loss['loss_mask'] = paddle.to_tensor([0.])
            loss['loss_dice'] = paddle.to_tensor([0.])
            return loss

        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
                                                              match_indices)
        # Upsample predicted masks to the GT resolution before the losses.
        src_masks = F.interpolate(
            src_masks.unsqueeze(0),
            size=target_masks.shape[-2:],
            mode="bilinear")[0]
        loss['loss_mask'] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
            src_masks, target_masks,
            paddle.to_tensor(
                [num_gts], dtype='float32'))
        loss['loss_dice'] = self.loss_coeff['dice'] * self._dice_loss(
            src_masks, target_masks, num_gts)
        return loss

    def _dice_loss(self, inputs, targets, num_gts):
        # Soft dice loss over flattened per-instance masks, +1 smoothing.
        inputs = F.sigmoid(inputs)
        inputs = inputs.flatten(1)
        targets = targets.flatten(1)
        numerator = 2 * (inputs * targets).sum(1)
        denominator = inputs.sum(-1) + targets.sum(-1)
        loss = 1 - (numerator + 1) / (denominator + 1)
        return loss.sum() / num_gts

    def _get_loss_aux(self, boxes, logits, gt_bbox, gt_class, bg_index,
                      num_gts):
        # Auxiliary losses: re-match and re-compute class/bbox/giou losses
        # for every intermediate decoder layer, then sum layer-wise.
        loss_class = []
        loss_bbox = []
        loss_giou = []
        for aux_boxes, aux_logits in zip(boxes, logits):
            match_indices = self.matcher(aux_boxes, aux_logits, gt_bbox,
                                         gt_class)
            loss_class.append(
                self._get_loss_class(aux_logits, gt_class, match_indices,
                                     bg_index, num_gts)['loss_class'])
            loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
                                        num_gts)
            loss_bbox.append(loss_['loss_bbox'])
            loss_giou.append(loss_['loss_giou'])
        loss = {
            'loss_class_aux': paddle.add_n(loss_class),
            'loss_bbox_aux': paddle.add_n(loss_bbox),
            'loss_giou_aux': paddle.add_n(loss_giou)
        }
        return loss

    def _get_index_updates(self, num_query_objects, target, match_indices):
        # Flatten per-image (query_idx, gt_idx) matches into global scatter
        # indices over [bs * num_query_objects] plus the gathered GT values.
        batch_idx = paddle.concat([
            paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)
        ])
        src_idx = paddle.concat([src for (src, _) in match_indices])
        src_idx += (batch_idx * num_query_objects)
        target_assign = paddle.concat([
            paddle.gather(
                t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
        ])
        return src_idx, target_assign

    def _get_src_target_assign(self, src, target, match_indices):
        # Gather matched prediction rows and matched GT rows in parallel;
        # images with no matches contribute empty [0, C] tensors.
        src_assign = paddle.concat([
            paddle.gather(
                t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (I, _) in zip(src, match_indices)
        ])
        target_assign = paddle.concat([
            paddle.gather(
                t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (_, J) in zip(target, match_indices)
        ])
        return src_assign, target_assign

    def forward(self, boxes, logits, gt_bbox, gt_class, masks=None,
                gt_mask=None):
        r"""
        Args:
            boxes (Tensor): [l, b, query, 4]
            logits (Tensor): [l, b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor, optional): [b, query, h, w]
            gt_mask (List(Tensor), optional): list[[n, H, W]]
        """
        # Match only on the last decoder layer's (detached) predictions.
        match_indices = self.matcher(boxes[-1].detach(), logits[-1].detach(),
                                     gt_bbox, gt_class)
        num_gts = sum(len(a) for a in gt_bbox)
        try:
            # TODO: Paddle does not have a "paddle.distributed.is_initialized()"
            # Average the GT count across ranks so loss normalization matches
            # in distributed training; fall back to the local count otherwise.
            num_gts = paddle.to_tensor([num_gts], dtype=paddle.float32)
            paddle.distributed.all_reduce(num_gts)
            num_gts = paddle.clip(
                num_gts / paddle.distributed.get_world_size(), min=1).item()
        except Exception:
            # FIX: was a bare `except:` which would also swallow
            # KeyboardInterrupt/SystemExit.
            num_gts = max(num_gts, 1)

        total_loss = dict()
        total_loss.update(
            self._get_loss_class(logits[-1], gt_class, match_indices,
                                 self.num_classes, num_gts))
        total_loss.update(
            self._get_loss_bbox(boxes[-1], gt_bbox, match_indices, num_gts))
        if masks is not None and gt_mask is not None:
            total_loss.update(
                self._get_loss_mask(masks, gt_mask, match_indices, num_gts))

        if self.aux_loss:
            total_loss.update(
                self._get_loss_aux(boxes[:-1], logits[:-1], gt_bbox, gt_class,
                                   self.num_classes, num_gts))

        return total_loss
ppdet/modeling/post_process.py
浏览文件 @
3c1c576d
...
...
@@ -19,18 +19,16 @@ import paddle.nn.functional as F
from ppdet.core.workspace import register
# FIX: `rbox2poly` was imported twice in the same statement.
from ppdet.modeling.bbox_utils import nonempty_bbox, rbox2poly
from ppdet.modeling.layers import TTFBox
from .transformers import bbox_cxcywh_to_xyxy
try:
    from collections.abc import Sequence
except Exception:
    # Python < 3.3 / 3.10 compatibility fallback.
    from collections import Sequence

# FIX: every name was listed twice in __all__; deduplicated.
__all__ = [
    'BBoxPostProcess', 'MaskPostProcess', 'FCOSPostProcess',
    'S2ANetBBoxPostProcess', 'JDEBBoxPostProcess', 'CenterNetPostProcess',
    'DETRBBoxPostProcess'
]
...
...
@@ -492,3 +490,64 @@ class CenterNetPostProcess(TTFBox):
else
:
results
=
paddle
.
concat
([
clses
,
scores
,
bboxes
],
axis
=
1
)
return
results
,
paddle
.
shape
(
results
)[
0
:
1
]
@register
class DETRBBoxPostProcess(object):
    # Decodes DETR head outputs into [label, score, x1, y1, x2, y2] rows.
    __shared__ = ['num_classes', 'use_focal_loss']
    __inject__ = []

    def __init__(self,
                 num_classes=80,
                 num_top_queries=100,
                 use_focal_loss=False):
        super(DETRBBoxPostProcess, self).__init__()
        self.num_classes = num_classes
        # Keep at most this many highest-scoring queries per image.
        self.num_top_queries = num_top_queries
        self.use_focal_loss = use_focal_loss

    def __call__(self, head_out, im_shape, scale_factor):
        """
        Decode the bbox.
        Args:
            head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output.
            im_shape (Tensor): The shape of the input image.
            scale_factor (Tensor): The scale factor of the input image.
        Returns:
            bbox_pred (Tensor): The output prediction with shape [N, 6], including
                labels, scores and bboxes. The size of bboxes are corresponding
                to the input image, the bboxes may be used in other branch.
            bbox_num (Tensor): The number of prediction boxes of each batch with
                shape [bs], and is N.
        """
        bboxes, logits, masks = head_out

        # Normalized cxcywh -> xyxy, then scale back to original image size.
        bbox_pred = bbox_cxcywh_to_xyxy(bboxes)
        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
        img_h, img_w = origin_shape.unbind(1)
        origin_shape = paddle.stack(
            [img_w, img_h, img_w, img_h], axis=-1).unsqueeze(0)
        bbox_pred *= origin_shape

        # Focal loss head: per-class sigmoid; otherwise softmax with the
        # trailing background column dropped.
        scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax(
            logits)[:, :, :-1]
        # Per query: best class score and its label.
        scores, labels = scores.max(-1), scores.argmax(-1)
        if scores.shape[1] > self.num_top_queries:
            # Keep the top-k queries per image by best-class score.
            scores, index = paddle.topk(
                scores, self.num_top_queries, axis=-1)
            labels = paddle.stack(
                [paddle.gather(l, i) for l, i in zip(labels, index)])
            bbox_pred = paddle.stack(
                [paddle.gather(b, i) for b, i in zip(bbox_pred, index)])
        # [label, score, x1, y1, x2, y2] per kept query.
        bbox_pred = paddle.concat(
            [
                labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1),
                bbox_pred
            ],
            axis=-1)
        # Every image keeps the same (fixed) number of predictions.
        bbox_num = paddle.to_tensor(
            bbox_pred.shape[1], dtype='int32').tile([bbox_pred.shape[0]])
        bbox_pred = bbox_pred.reshape([-1, 6])
        return bbox_pred, bbox_num
ppdet/modeling/transformers/__init__.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.
import
detr_transformer
from
.
import
utils
from
.
import
matchers
from
.
import
position_encoding
from
.detr_transformer
import
*
from
.utils
import
*
from
.matchers
import
*
from
.position_encoding
import
*
ppdet/modeling/transformers/detr_transformer.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
import
paddle.nn
as
nn
from
paddle.nn.layer.transformer
import
_convert_attention_mask
import
paddle.nn.functional
as
F
from
ppdet.core.workspace
import
register
from
..layers
import
MultiHeadAttention
from
.position_encoding
import
PositionEmbedding
from
.utils
import
*
from
..initializer
import
*
__all__
=
[
'DETRTransformer'
]
class TransformerEncoderLayer(nn.Layer):
    """One DETR encoder layer: self-attention + feed-forward, each with
    residual connection and LayerNorm (pre-norm or post-norm)."""

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerEncoderLayer, self).__init__()
        # Attention/activation dropouts default to the generic dropout rate.
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        # True -> pre-norm (norm before each sublayer); False -> post-norm.
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        # Resolve e.g. "relu" -> F.relu.
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        # Add positional embedding when provided; identity otherwise.
        return tensor if pos_embed is None else tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        src_mask = _convert_attention_mask(src_mask, src.dtype)

        # --- self-attention sublayer ---
        residual = src
        if self.normalize_before:
            src = self.norm1(src)
        # Position embedding is added to queries/keys only, not to values.
        q = k = self.with_pos_embed(src, pos_embed)
        src = self.self_attn(q, k, value=src, attn_mask=src_mask)

        src = residual + self.dropout1(src)
        if not self.normalize_before:
            src = self.norm1(src)

        # --- feed-forward sublayer ---
        residual = src
        if self.normalize_before:
            src = self.norm2(src)
        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = residual + self.dropout2(src)
        if not self.normalize_before:
            src = self.norm2(src)
        return src
class TransformerEncoder(nn.Layer):
    """A stack of cloned encoder layers applied sequentially, with an
    optional final normalization layer."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        # Deep-copy the prototype layer num_layers times.
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, src_mask=None, pos_embed=None):
        src_mask = _convert_attention_mask(src_mask, src.dtype)

        # Feed the input through every layer in order, threading the same
        # mask and positional embedding into each one.
        out = src
        for enc_layer in self.layers:
            out = enc_layer(out, src_mask=src_mask, pos_embed=pos_embed)

        if self.norm is None:
            return out
        return self.norm(out)
class TransformerDecoderLayer(nn.Layer):
    """One DETR decoder layer: query self-attention, cross-attention over
    the encoder memory, then feed-forward; each sublayer has residual +
    LayerNorm (pre- or post-norm)."""

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerDecoderLayer, self).__init__()
        # Attention/activation dropouts default to the generic dropout rate.
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        # Add positional embedding when provided; identity otherwise.
        return tensor if pos_embed is None else tensor + pos_embed

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)

        # --- self-attention over object queries ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)
        # Query positional embedding goes into q and k only, not values.
        q = k = self.with_pos_embed(tgt, query_pos_embed)
        tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask)
        tgt = residual + self.dropout1(tgt)
        if not self.normalize_before:
            tgt = self.norm1(tgt)

        # --- cross-attention: queries attend to encoder memory ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)
        q = self.with_pos_embed(tgt, query_pos_embed)
        k = self.with_pos_embed(memory, pos_embed)
        tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask)
        tgt = residual + self.dropout2(tgt)
        if not self.normalize_before:
            tgt = self.norm2(tgt)

        # --- feed-forward sublayer ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm3(tgt)
        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = residual + self.dropout3(tgt)
        if not self.normalize_before:
            tgt = self.norm3(tgt)
        return tgt
class TransformerDecoder(nn.Layer):
    """A stack of cloned decoder layers. When `return_intermediate` is set,
    the (normalized) output of every layer is stacked and returned so that
    auxiliary losses can be attached per decoder layer."""

    def __init__(self, decoder_layer, num_layers, norm=None,
                 return_intermediate=False):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)

        out = tgt
        per_layer_outputs = []
        for dec_layer in self.layers:
            out = dec_layer(
                out,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                pos_embed=pos_embed,
                query_pos_embed=query_pos_embed)
            if self.return_intermediate:
                # Each intermediate output is normalized before stacking.
                per_layer_outputs.append(self.norm(out))

        if self.norm is not None:
            out = self.norm(out)

        if self.return_intermediate:
            # [num_layers, bs, num_queries, d_model]
            return paddle.stack(per_layer_outputs)

        # Keep a leading layer axis of size 1 for a uniform return shape.
        return out.unsqueeze(0)
@register
class DETRTransformer(nn.Layer):
    # Full DETR transformer: 1x1 input projection, sine/learned positional
    # embedding, encoder stack and decoder stack with learned object queries.
    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=100,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 backbone_num_channels=2048,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(DETRTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        self.hidden_dim = hidden_dim
        self.nhead = nhead

        encoder_layer = TransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        # Final encoder norm only needed for the pre-norm variant.
        encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)

        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        decoder_norm = nn.LayerNorm(hidden_dim)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec)

        # Project backbone channels down to the transformer width.
        self.input_proj = nn.Conv2D(
            backbone_num_channels, hidden_dim, kernel_size=1)
        # Learned object-query embeddings (decoder positional embeddings).
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
        # hidden_dim // 2 per spatial axis (x and y halves are concatenated).
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            normalize=True if position_embed_type == 'sine' else False,
            embed_type=position_embed_type)

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
        conv_init_(self.input_proj)
        normal_(self.query_pos_embed.weight)

    @classmethod
    def from_config(cls, cfg, input_shape):
        # Infer the backbone channel count from the last feature-map shape.
        return {
            'backbone_num_channels': [i.channels for i in input_shape][-1],
        }

    def forward(self, src, src_mask=None):
        r"""
        Applies a Transformer model on the inputs.

        Parameters:
            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]].
            src_mask (Tensor, optional): A tensor used in multi-head attention
                to prevents attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                [bs, H, W]`. When the data type is bool, the unwanted positions
                have `False` values and the others have `True` values. When the
                data type is int, the unwanted positions have 0 values and the
                others have 1 values. When the data type is float, the unwanted
                positions have `-INF` values and the others have 0 values. It
                can be None when nothing wanted or needed to be prevented
                attention to. Default None.

        Returns:
            output (Tensor): [num_levels, batch_size, num_queries, hidden_dim]
            memory (Tensor): [batch_size, hidden_dim, h, w]
        """
        # use last level feature map
        src_proj = self.input_proj(src[-1])
        bs, c, h, w = src_proj.shape
        # flatten [B, C, H, W] to [B, HxW, C]
        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])
        if src_mask is not None:
            # Resize the padding mask to the feature-map resolution.
            src_mask = F.interpolate(
                src_mask.unsqueeze(0).astype(src_flatten.dtype),
                size=(h, w))[0].astype('bool')
        else:
            # No padding: every position is valid.
            src_mask = paddle.ones([bs, h, w], dtype='bool')
        pos_embed = self.position_embedding(src_mask).flatten(2).transpose(
            [0, 2, 1])

        # Convert the bool mask to an additive float mask and broadcast it
        # over heads and query positions: [bs, 1, 1, HxW].
        src_mask = _convert_attention_mask(src_mask, src_flatten.dtype)
        src_mask = src_mask.reshape([bs, 1, 1, -1])

        memory = self.encoder(
            src_flatten, src_mask=src_mask, pos_embed=pos_embed)

        # Tile the learned query embeddings over the batch; decoder input
        # (tgt) starts as zeros as in DETR.
        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
            [bs, 1, 1])
        tgt = paddle.zeros_like(query_pos_embed)
        output = self.decoder(
            tgt,
            memory,
            memory_mask=src_mask,
            pos_embed=pos_embed,
            query_pos_embed=query_pos_embed)

        # Return decoder outputs, memory restored to [bs, c, h, w], the
        # projected features, and the mask reshaped for downstream heads.
        return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]),
                src_proj, src_mask.reshape([bs, 1, 1, h, w]))
ppdet/modeling/transformers/matchers.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
scipy.optimize
import
linear_sum_assignment
from
ppdet.core.workspace
import
register
,
serializable
from
..losses.iou_loss
import
GIoULoss
from
.utils
import
bbox_cxcywh_to_xyxy
__all__
=
[
'HungarianMatcher'
]
@register
@serializable
class HungarianMatcher(nn.Layer):
    # Optimal bipartite matching between predictions and ground truth via
    # scipy's linear_sum_assignment over a class/bbox/giou cost matrix.
    __shared__ = ['use_focal_loss']

    def __init__(self,
                 matcher_coeff={'class': 1,
                                'bbox': 5,
                                'giou': 2},
                 use_focal_loss=False,
                 alpha=0.25,
                 gamma=2.0):
        r"""
        Args:
            matcher_coeff (dict): The coefficient of hungarian matcher cost.
            use_focal_loss (bool): Score the class cost with focal terms
                (sigmoid) instead of plain softmax probabilities.
            alpha (float): Focal loss alpha (only used with use_focal_loss).
            gamma (float): Focal loss gamma (only used with use_focal_loss).
        """
        # NOTE(review): matcher_coeff is a mutable default argument; it is
        # never mutated here, but a per-call copy would be safer.
        super(HungarianMatcher, self).__init__()
        self.matcher_coeff = matcher_coeff
        self.use_focal_loss = use_focal_loss
        self.alpha = alpha
        self.gamma = gamma

        self.giou_loss = GIoULoss()

    def forward(self, boxes, logits, gt_bbox, gt_class):
        r"""
        Args:
            boxes (Tensor): [b, query, 4]
            logits (Tensor): [b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = boxes.shape[:2]

        num_gts = sum(len(a) for a in gt_class)
        if num_gts == 0:
            # No ground truth anywhere in the batch: empty match per image.
            return [(paddle.to_tensor(
                [], dtype=paddle.int64), paddle.to_tensor(
                    [], dtype=paddle.int64)) for _ in range(bs)]

        # We flatten to compute the cost matrices in a batch
        # [batch_size * num_queries, num_classes]
        out_prob = F.sigmoid(logits.flatten(
            0, 1)) if self.use_focal_loss else F.softmax(
                logits.flatten(0, 1))
        # [batch_size * num_queries, 4]
        out_bbox = boxes.flatten(0, 1)

        # Also concat the target labels and boxes
        tgt_ids = paddle.concat(gt_class).flatten()
        tgt_bbox = paddle.concat(gt_bbox)

        # Compute the classification cost
        if self.use_focal_loss:
            # Focal-style cost: pos minus neg term at the target class
            # (1e-8 guards the log against exact 0/1 probabilities).
            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
                1 - out_prob + 1e-8).log())
            pos_cost_class = self.alpha * ((1 - out_prob)**self.gamma) * (-(
                out_prob + 1e-8).log())
            cost_class = paddle.gather(
                pos_cost_class, tgt_ids, axis=1) - paddle.gather(
                    neg_cost_class, tgt_ids, axis=1)
        else:
            # Plain DETR: negative probability of the target class.
            cost_class = -paddle.gather(out_prob, tgt_ids, axis=1)

        # Compute the L1 cost between boxes
        cost_bbox = (out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(
            -1)

        # Compute the giou cost betwen boxes
        cost_giou = self.giou_loss(
            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)

        # Final cost matrix
        C = self.matcher_coeff['class'] * cost_class + self.matcher_coeff[
            'bbox'] * cost_bbox + \
            self.matcher_coeff['giou'] * cost_giou
        C = C.reshape([bs, num_queries, -1])
        C = [a.squeeze(0) for a in C.chunk(bs)]
        # Split GT columns per image and solve one assignment per image on CPU.
        sizes = [a.shape[0] for a in gt_bbox]
        indices = [
            linear_sum_assignment(c.split(sizes, -1)[i].numpy())
            for i, c in enumerate(C)
        ]
        return [(paddle.to_tensor(
            i, dtype=paddle.int64), paddle.to_tensor(
                j, dtype=paddle.int64)) for i, j in indices]
ppdet/modeling/transformers/position_encoding.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
paddle
import
paddle.nn
as
nn
from
ppdet.core.workspace
import
register
,
serializable
@register
@serializable
class PositionEmbedding(nn.Layer):
    """Position embedding for transformer-based detectors (e.g. DETR).

    Two variants are supported:
      * 'sine': fixed sinusoidal embedding computed from the padding mask
        (no learnable parameters).
      * 'learned': learnable row/column embedding tables.

    Args:
        num_pos_feats (int): number of features per spatial axis; the output
            channel count is 2 * num_pos_feats (y and x halves concatenated).
        temperature (int): frequency base of the sinusoid ('sine' only).
        normalize (bool): normalize coordinates to [0, scale] before applying
            the sinusoid ('sine' only).
        scale (float|None): normalization scale, defaults to 2 * pi; passing
            a scale requires normalize=True.
        embed_type (str): 'sine' or 'learned'.
        num_embeddings (int): table size per axis for the 'learned' variant,
            i.e. the maximum supported feature-map height/width.
    """

    def __init__(self,
                 num_pos_feats=128,
                 temperature=10000,
                 normalize=True,
                 scale=None,
                 embed_type='sine',
                 num_embeddings=50):
        super(PositionEmbedding, self).__init__()
        assert embed_type in ['sine', 'learned']

        self.embed_type = embed_type
        if self.embed_type == 'sine':
            self.num_pos_feats = num_pos_feats
            self.temperature = temperature
            self.normalize = normalize
            if scale is not None and normalize is False:
                raise ValueError("normalize should be True if scale is passed")
            if scale is None:
                scale = 2 * math.pi
            self.scale = scale
        elif self.embed_type == 'learned':
            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
        else:
            raise ValueError(f"not supported {self.embed_type}")

    def forward(self, mask):
        """
        Args:
            mask (Tensor): [B, H, W] bool mask of the feature map
                (presumably True on valid, non-padded pixels — confirm
                against the caller).
        Returns:
            pos (Tensor): [B, C, H, W] with C = 2 * num_pos_feats.
        """
        assert mask.dtype == paddle.bool
        if self.embed_type == 'sine':
            mask = mask.astype('float32')
            # Cumulative sums assign each pixel its (1-based) y/x coordinate
            # within the masked region.
            y_embed = mask.cumsum(1, dtype='float32')
            x_embed = mask.cumsum(2, dtype='float32')
            if self.normalize:
                eps = 1e-6
                y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
                x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
            # dim_t[i] = temperature ** (2 * (i // 2) / num_pos_feats)
            dim_t = 2 * (paddle.arange(self.num_pos_feats) //
                         2).astype('float32')
            dim_t = self.temperature**(dim_t / self.num_pos_feats)

            pos_x = x_embed.unsqueeze(-1) / dim_t
            pos_y = y_embed.unsqueeze(-1) / dim_t
            # Interleave sin on even channels and cos on odd channels.
            pos_x = paddle.stack(
                (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
                axis=4).flatten(3)
            pos_y = paddle.stack(
                (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
                axis=4).flatten(3)
            pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2])
            return pos
        elif self.embed_type == 'learned':
            h, w = mask.shape[-2:]
            i = paddle.arange(w)
            j = paddle.arange(h)
            x_emb = self.col_embed(i)  # [w, num_pos_feats]
            y_emb = self.row_embed(j)  # [h, num_pos_feats]
            # FIX: paddle.Tensor has no `repeat` method and `tile` expects a
            # list/tuple of repeat times, so the original
            # `.repeat(h, 1, 1)` / `.tile(mask.shape[0], 1, 1, 1)` calls
            # raised at runtime. Use `tile([...])` throughout.
            pos = paddle.concat(
                [
                    x_emb.unsqueeze(0).tile([h, 1, 1]),
                    y_emb.unsqueeze(1).tile([1, w, 1]),
                ],
                axis=-1).transpose([2, 0, 1]).unsqueeze(0).tile(
                    [mask.shape[0], 1, 1, 1])
            return pos
        else:
            raise ValueError(f"not supported {self.embed_type}")
ppdet/modeling/transformers/utils.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
copy
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
..bbox_utils
import
bbox_overlaps
__all__
=
[
'_get_clones'
,
'bbox_overlaps'
,
'bbox_cxcywh_to_xyxy'
,
'bbox_xyxy_to_cxcywh'
,
'sigmoid_focal_loss'
]
def _get_clones(module, N):
    """Return an nn.LayerList holding N independent deep copies of *module*."""
    layers = []
    for _ in range(N):
        layers.append(copy.deepcopy(module))
    return nn.LayerList(layers)
def bbox_cxcywh_to_xyxy(x):
    """Convert boxes from center format (cx, cy, w, h) to corners (x0, y0, x1, y1)."""
    cx, cy, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    corners = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
    return paddle.stack(corners, axis=-1)
def bbox_xyxy_to_cxcywh(x):
    """Convert boxes from corners (x0, y0, x1, y1) to center format (cx, cy, w, h)."""
    xmin, ymin, xmax, ymax = x.unbind(-1)
    components = [
        (xmin + xmax) / 2,  # cx
        (ymin + ymax) / 2,  # cy
        xmax - xmin,        # w
        ymax - ymin,        # h
    ]
    return paddle.stack(components, axis=-1)
def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
    """Sigmoid focal loss, reduced with mean over dim 1 then summed.

    Args:
        logit (Tensor): [N, M] raw (pre-sigmoid) predictions.
        label (Tensor): [N, M] binary targets in {0, 1}.
        normalizer (float): divide the reduced loss by this when > 1.
        alpha (float): class-balancing factor; skipped when negative.
        gamma (float): focusing exponent on the misclassification term.
    Returns:
        Tensor: scalar loss.
    """
    pred = F.sigmoid(logit)
    bce = F.binary_cross_entropy_with_logits(logit, label, reduction="none")
    # Probability assigned to the true class of each element.
    pt = pred * label + (1 - pred) * (1 - label)
    focal = bce * (1 - pt) ** gamma
    if alpha >= 0:
        # Per-element class-balancing weight.
        weight = alpha * label + (1 - alpha) * (1 - label)
        focal = weight * focal
    reduced = focal.mean(1).sum()
    if normalizer > 1.:
        return reduced / normalizer
    return reduced
ppdet/optimizer.py
浏览文件 @
3c1c576d
...
...
@@ -244,10 +244,11 @@ class OptimizerBuilder():
optim_args
=
self
.
optimizer
.
copy
()
optim_type
=
optim_args
[
'type'
]
del
optim_args
[
'type'
]
if
optim_type
!=
'AdamW'
:
optim_args
[
'weight_decay'
]
=
regularization
op
=
getattr
(
optimizer
,
optim_type
)
return
op
(
learning_rate
=
learning_rate
,
parameters
=
params
,
weight_decay
=
regularization
,
grad_clip
=
grad_clip
,
**
optim_args
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录