未验证 提交 3c1c576d 编写于 作者: S shangliang Xu 提交者: GitHub

[Transformer] Add transformer base code (#3612)

* Add DETR

* drop return_pad_mask in PadBatch
上级 43515234
......@@ -40,6 +40,7 @@ from PIL import Image, ImageEnhance, ImageDraw
from ppdet.core.workspace import serializable
from ppdet.modeling.layers import AnchorGrid
from ppdet.modeling import bbox_utils
from ..reader import Compose
from .op_helper import (satisfy_sample_constraint, filter_and_process,
generate_sample_bbox, clip_bbox, data_anchor_sampling,
......@@ -2348,7 +2349,7 @@ class RandomResizeCrop(BaseOperator):
for gt_segm in sample['gt_segm']
]
sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
return sample
......@@ -2528,3 +2529,354 @@ class Mosaic(BaseOperator):
sample['difficult'] = difficult
return sample
@register_op
class RandomSelect(BaseOperator):
    """Randomly apply one of two transform pipelines to a sample.

    ``transforms1`` is applied with probability ``p``; otherwise
    ``transforms2`` is applied. Both pipelines are compiled with
    ``Compose`` at construction time.
    """

    def __init__(self, transforms1, transforms2, p=0.5):
        super(RandomSelect, self).__init__()
        self.transforms1 = Compose(transforms1)
        self.transforms2 = Compose(transforms2)
        self.p = p

    def apply(self, sample, context=None):
        # Draw once; pick the pipeline based on the configured probability.
        chosen = self.transforms1 if random.random() < self.p else self.transforms2
        return chosen(sample)
@register_op
class RandomShortSideResize(BaseOperator):
    def __init__(self,
                 short_side_sizes,
                 max_size=None,
                 interp=cv2.INTER_LINEAR,
                 random_interp=False):
        """
        Resize the image randomly according to the short side. If max_size is not None,
        the long side is scaled according to max_size. The whole process will be keep ratio.
        Args:
            short_side_sizes (list|tuple): Image target short side size.
            max_size (int): The size of the longest side of image after resize.
            interp (int): The interpolation method.
            random_interp (bool): Whether random select interpolation method.
        """
        super(RandomShortSideResize, self).__init__()

        assert isinstance(short_side_sizes,
                          Sequence), "short_side_sizes must be List or Tuple"

        self.short_side_sizes = short_side_sizes
        self.max_size = max_size
        self.interp = interp
        self.random_interp = random_interp
        # Candidate interpolation methods used when random_interp is True.
        self.interps = [
            cv2.INTER_NEAREST,
            cv2.INTER_LINEAR,
            cv2.INTER_AREA,
            cv2.INTER_CUBIC,
            cv2.INTER_LANCZOS4,
        ]

    def get_size_with_aspect_ratio(self, image_shape, size, max_size=None):
        """Compute an output (width, height) whose short side equals ``size``
        while keeping the original aspect ratio; optionally shrink so the
        long side does not exceed ``max_size``.

        Args:
            image_shape (tuple): (h, w) of the input image.
            size (int): target short-side length.
            max_size (int, optional): cap for the long side.
        Returns:
            tuple: (out_width, out_height).
        """
        h, w = image_shape
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            # Shrink the target short side so the scaled long side <= max_size.
            if max_original_size / min_original_size * size > max_size:
                size = int(
                    round(max_size * min_original_size / max_original_size))

        # Short side already matches: no resize needed.
        if (w <= h and w == size) or (h <= w and h == size):
            return (w, h)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (ow, oh)

    def resize(self,
               sample,
               target_size,
               max_size=None,
               interp=cv2.INTER_LINEAR):
        """Resize the image and all aligned annotations in ``sample``.

        Updates 'image', 'im_shape', 'scale_factor', and (when present)
        'gt_bbox', 'gt_poly', 'semantic', and 'gt_segm'.
        """
        im = sample['image']
        if not isinstance(im, np.ndarray):
            raise TypeError("{}: image type is not numpy.".format(self))
        if len(im.shape) != 3:
            raise ImageError('{}: image is not 3-dimensional.'.format(self))

        # target_size is (out_w, out_h); per-axis scales for annotations.
        target_size = self.get_size_with_aspect_ratio(im.shape[:2], target_size,
                                                      max_size)
        im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[
            0] / im.shape[1]

        sample['image'] = cv2.resize(im, target_size, interpolation=interp)
        # im_shape is stored as (h, w), hence the reversal of (w, h).
        sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32)
        if 'scale_factor' in sample:
            scale_factor = sample['scale_factor']
            # Compose with any scaling applied by earlier ops.
            sample['scale_factor'] = np.asarray(
                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
                dtype=np.float32)
        else:
            sample['scale_factor'] = np.asarray(
                [im_scale_y, im_scale_x], dtype=np.float32)

        # apply bbox
        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
            sample['gt_bbox'] = self.apply_bbox(
                sample['gt_bbox'], [im_scale_x, im_scale_y], target_size)
        # apply polygon
        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im.shape[:2],
                                                [im_scale_x, im_scale_y])
        # apply semantic
        # NOTE(review): truth-testing sample['semantic'] assumes it is None or
        # a scalar-like value here; a multi-element numpy array would raise.
        # Also, this uses self.interp rather than the per-call `interp` —
        # presumably intentional for label maps, but worth confirming.
        if 'semantic' in sample and sample['semantic']:
            semantic = sample['semantic']
            semantic = cv2.resize(
                semantic.astype('float32'),
                target_size,
                interpolation=self.interp)
            semantic = np.asarray(semantic).astype('int32')
            semantic = np.expand_dims(semantic, 0)
            sample['semantic'] = semantic
        # apply gt_segm: nearest-neighbour keeps masks binary.
        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
            masks = [
                cv2.resize(
                    gt_segm, target_size, interpolation=cv2.INTER_NEAREST)
                for gt_segm in sample['gt_segm']
            ]
            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)

        return sample

    def apply_bbox(self, bbox, scale, size):
        """Scale boxes by (im_scale_x, im_scale_y) and clip to the new size."""
        im_scale_x, im_scale_y = scale
        resize_w, resize_h = size
        bbox[:, 0::2] *= im_scale_x
        bbox[:, 1::2] *= im_scale_y
        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
        return bbox.astype('float32')

    def apply_segm(self, segms, im_size, scale):
        """Scale polygon- or RLE-format segments by (im_scale_x, im_scale_y)."""

        def _resize_poly(poly, im_scale_x, im_scale_y):
            # Flat [x0, y0, x1, y1, ...] coordinate list.
            resized_poly = np.array(poly).astype('float32')
            resized_poly[0::2] *= im_scale_x
            resized_poly[1::2] *= im_scale_y
            return resized_poly.tolist()

        def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):
            # Uncompressed RLE (list counts) must be converted first.
            if 'counts' in rle and type(rle['counts']) == list:
                rle = mask_util.frPyObjects(rle, im_h, im_w)

            mask = mask_util.decode(rle)
            # NOTE(review): uses self.interp, not the randomly chosen interp —
            # confirm this is intended for RLE masks.
            mask = cv2.resize(
                mask,
                None,
                None,
                fx=im_scale_x,
                fy=im_scale_y,
                interpolation=self.interp)
            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
            return rle

        im_h, im_w = im_size
        im_scale_x, im_scale_y = scale
        resized_segms = []
        for segm in segms:
            if is_poly(segm):
                # Polygon format
                resized_segms.append([
                    _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm
                ])
            else:
                # RLE format
                import pycocotools.mask as mask_util
                resized_segms.append(
                    _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))

        return resized_segms

    def apply(self, sample, context=None):
        # Pick a random short side (and optionally a random interpolation).
        target_size = random.choice(self.short_side_sizes)
        interp = random.choice(
            self.interps) if self.random_interp else self.interp

        return self.resize(sample, target_size, self.max_size, interp)
@register_op
class RandomSizeCrop(BaseOperator):
    """
    Cut the image randomly according to `min_size` and `max_size`.

    A crop height/width is sampled uniformly from
    ``[min_size, min(image_side, max_size)]``, then the crop window is placed
    uniformly at random inside the image. Boxes, classes, scores, crowd
    flags, polygons and segmentation masks are cropped consistently; boxes
    whose area collapses to zero after cropping are dropped (together with
    their aligned annotations).

    Args:
        min_size (int): minimum crop side length.
        max_size (int): maximum crop side length.
    """

    def __init__(self, min_size, max_size):
        super(RandomSizeCrop, self).__init__()
        self.min_size = min_size
        self.max_size = max_size

        from paddle.vision.transforms.functional import crop as paddle_crop
        self.paddle_crop = paddle_crop

    @staticmethod
    def get_crop_params(img_shape, output_size):
        """Get parameters for ``crop`` for a random crop.

        Args:
            img_shape (list|tuple): Image's height and width.
            output_size (list|tuple): Expected output size of the crop.
        Returns:
            tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
        """
        h, w = img_shape
        th, tw = output_size

        if h + 1 < th or w + 1 < tw:
            raise ValueError(
                "Required crop size {} is larger then input image size {}".
                format((th, tw), (h, w)))

        if w == tw and h == th:
            return 0, 0, h, w

        # BUGFIX: random.randint is inclusive on BOTH ends (unlike
        # torch.randint), so the high bound must be h - th, not h - th + 1;
        # otherwise the sampled origin can push the window one pixel past the
        # image border. max(0, ...) keeps the th == h + 1 corner case
        # (tolerated by the check above) from raising ValueError.
        i = random.randint(0, max(0, h - th))
        j = random.randint(0, max(0, w - tw))
        return i, j, th, tw

    def crop(self, sample, region):
        """Crop the image and every aligned annotation to ``region``.

        Args:
            sample (dict): data sample to modify in place.
            region (tuple): (i, j, h, w) — top-left corner and crop size.
        Returns:
            dict: the cropped sample.
        """
        image_shape = sample['image'].shape[:2]
        sample['image'] = self.paddle_crop(sample['image'], *region)

        keep_index = None
        # Crop boxes; drop instances whose cropped box has zero area.
        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], region)
            bbox = sample['gt_bbox'].reshape([-1, 2, 2])
            area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1)
            keep_index = np.where(area > 0)[0]
            sample['gt_bbox'] = sample['gt_bbox'][keep_index] if len(
                keep_index) > 0 else np.zeros(
                    [0, 4], dtype=np.float32)
            sample['gt_class'] = sample['gt_class'][keep_index] if len(
                keep_index) > 0 else np.zeros(
                    [0, 1], dtype=np.float32)
            if 'gt_score' in sample:
                sample['gt_score'] = sample['gt_score'][keep_index] if len(
                    keep_index) > 0 else np.zeros(
                        [0, 1], dtype=np.float32)
            if 'is_crowd' in sample:
                sample['is_crowd'] = sample['is_crowd'][keep_index] if len(
                    keep_index) > 0 else np.zeros(
                        [0, 1], dtype=np.float32)

        # Crop polygons, keeping only instances whose box survived.
        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region,
                                                image_shape)
            if keep_index is not None:
                sample['gt_poly'] = sample['gt_poly'][keep_index]

        # Crop binary masks, keeping only instances whose box survived.
        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
            i, j, h, w = region
            sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w]
            if keep_index is not None:
                sample['gt_segm'] = sample['gt_segm'][keep_index]

        return sample

    def apply_bbox(self, bbox, region):
        """Shift boxes into crop coordinates and clip them to the window."""
        i, j, h, w = region
        region_size = np.asarray([w, h])
        crop_bbox = bbox - np.asarray([j, i, j, i])
        crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size)
        crop_bbox = crop_bbox.clip(min=0)
        return crop_bbox.reshape([-1, 4]).astype('float32')

    def apply_segm(self, segms, region, image_shape):
        """Crop polygon- or RLE-format segments to ``region``."""

        def _crop_poly(segm, crop):
            # Intersect each polygon with the crop rectangle via shapely,
            # then translate the result into crop-local coordinates.
            xmin, ymin, xmax, ymax = crop
            crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
            crop_p = np.array(crop_coord).reshape(4, 2)
            crop_p = Polygon(crop_p)

            crop_segm = list()
            for poly in segm:
                poly = np.array(poly).reshape(len(poly) // 2, 2)
                polygon = Polygon(poly)
                if not polygon.is_valid:
                    # Self-intersecting polygon: rebuild it as the set of
                    # simple polygons formed by its exterior ring.
                    exterior = polygon.exterior
                    multi_lines = exterior.intersection(exterior)
                    polygons = shapely.ops.polygonize(multi_lines)
                    polygon = MultiPolygon(polygons)
                multi_polygon = list()
                if isinstance(polygon, MultiPolygon):
                    multi_polygon = copy.deepcopy(polygon)
                else:
                    multi_polygon.append(copy.deepcopy(polygon))

                for per_polygon in multi_polygon:
                    inter = per_polygon.intersection(crop_p)
                    if not inter:
                        continue
                    if isinstance(inter, (MultiPolygon, GeometryCollection)):
                        for part in inter:
                            if not isinstance(part, Polygon):
                                continue
                            part = np.squeeze(
                                np.array(part.exterior.coords[:-1]).reshape(1,
                                                                            -1))
                            part[0::2] -= xmin
                            part[1::2] -= ymin
                            crop_segm.append(part.tolist())
                    elif isinstance(inter, Polygon):
                        crop_poly = np.squeeze(
                            np.array(inter.exterior.coords[:-1]).reshape(1, -1))
                        crop_poly[0::2] -= xmin
                        crop_poly[1::2] -= ymin
                        crop_segm.append(crop_poly.tolist())
                    else:
                        continue
            return crop_segm

        def _crop_rle(rle, crop, height, width):
            # Uncompressed RLE (list counts) must be converted first.
            if 'counts' in rle and type(rle['counts']) == list:
                rle = mask_util.frPyObjects(rle, height, width)
            mask = mask_util.decode(rle)
            mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
            return rle

        i, j, h, w = region
        crop = [j, i, j + w, i + h]
        height, width = image_shape
        crop_segms = []
        for segm in segms:
            if is_poly(segm):
                # Imported lazily so shapely is only required for polygons.
                import copy
                import shapely.ops
                from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
                # Polygon format
                crop_segms.append(_crop_poly(segm, crop))
            else:
                # RLE format
                import pycocotools.mask as mask_util
                crop_segms.append(_crop_rle(segm, crop, height, width))
        return crop_segms

    def apply(self, sample, context=None):
        # Sample a crop size bounded by both the image and max_size.
        h = random.randint(self.min_size,
                           min(sample['image'].shape[0], self.max_size))
        w = random.randint(self.min_size,
                           min(sample['image'].shape[1], self.max_size))

        region = self.get_crop_params(sample['image'].shape[:2], [h, w])
        return self.crop(sample, region)
......@@ -27,6 +27,7 @@ from . import post_process
from . import layers
from . import reid
from . import mot
from . import transformers
from .ops import *
from .backbones import *
......@@ -39,3 +40,4 @@ from .post_process import *
from .layers import *
from .reid import *
from .mot import *
from .transformers import *
......@@ -21,6 +21,7 @@ from . import jde
from . import deepsort
from . import fairmot
from . import centernet
from . import detr
from .meta_arch import *
from .faster_rcnn import *
......@@ -39,3 +40,4 @@ from .deepsort import *
from .fairmot import *
from .centernet import *
from .blazeface import *
from .detr import *
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from .meta_arch import BaseArch
from ppdet.core.workspace import register, create
__all__ = ['DETR']
@register
class DETR(BaseArch):
    """DETR detection architecture: backbone -> transformer -> DETR head.

    During training ``_forward`` returns the head's loss dict; during
    inference the head outputs are decoded into boxes by ``post_process``.
    """
    __category__ = 'architecture'
    __inject__ = ['post_process']

    def __init__(self,
                 backbone,
                 transformer,
                 detr_head,
                 post_process='DETRBBoxPostProcess'):
        super(DETR, self).__init__()
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.post_process = post_process

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Build sub-modules from config: the transformer consumes the
        backbone's output shape, and the head consumes both."""
        backbone = create(cfg['backbone'])
        transformer = create(cfg['transformer'], input_shape=backbone.out_shape)
        detr_head = create(
            cfg['detr_head'],
            hidden_dim=transformer.hidden_dim,
            nhead=transformer.nhead,
            input_shape=backbone.out_shape)
        return {
            'backbone': backbone,
            'transformer': transformer,
            'detr_head': detr_head,
        }

    def _forward(self):
        # Backbone features -> transformer (with padding mask) -> head.
        body_feats = self.backbone(self.inputs)
        out_transformer = self.transformer(body_feats, self.inputs['pad_mask'])
        if self.training:
            # The head computes the loss dict directly at train time.
            return self.detr_head(out_transformer, body_feats, self.inputs)
        preds = self.detr_head(out_transformer, body_feats)
        # post_process returns (bbox, bbox_num).
        return self.post_process(preds, self.inputs['im_shape'],
                                 self.inputs['scale_factor'])

    def get_loss(self, ):
        losses = self._forward()
        # Total loss: sum every term except logging-only entries.
        loss_terms = [v for name, v in losses.items() if 'log' not in name]
        losses['loss'] = paddle.add_n(loss_terms)
        return losses

    def get_pred(self):
        bbox_pred, bbox_num = self._forward()
        return {'bbox': bbox_pred, 'bbox_num': bbox_num}
......@@ -25,6 +25,7 @@ from . import face_head
from . import s2anet_head
from . import keypoint_hrhrnet_head
from . import centernet_head
from . import detr_head
from .bbox_head import *
from .mask_head import *
......@@ -39,3 +40,4 @@ from .face_head import *
from .s2anet_head import *
from .keypoint_hrhrnet_head import *
from .centernet_head import *
from .detr_head import *
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
import pycocotools.mask as mask_util
from ..initializer import *
__all__ = ['DETRHead']
class MLP(nn.Layer):
    """Simple multi-layer perceptron: ``num_layers`` Linear layers with
    ReLU between them (no activation after the last layer)."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer widths: input -> hidden * (num_layers - 1) -> output.
        dims = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.LayerList(
            nn.Linear(in_dim, out_dim)
            for in_dim, out_dim in zip(dims[:-1], dims[1:]))

        self._reset_parameters()

    def _reset_parameters(self):
        for layer in self.layers:
            linear_init_(layer)

    def forward(self, x):
        last = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last:
                x = F.relu(x)
        return x
class MultiHeadAttentionMap(nn.Layer):
    """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""

    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0,
                 bias=True):
        # query_dim: channel dim of queries and of the key feature map.
        # hidden_dim: projection width; must be divisible by num_heads.
        super().__init__()
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout)

        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.XavierUniform())
        # bias_attr=False disables the bias term entirely when bias is False.
        bias_attr = paddle.framework.ParamAttr(
            initializer=paddle.nn.initializer.Constant()) if bias else False

        self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)
        # 1x1 conv: project the key feature map per-pixel, keeping H and W.
        self.k_proj = nn.Conv2D(
            query_dim,
            hidden_dim,
            1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)

        # 1/sqrt(head_dim) scaling for dot-product attention.
        self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5

    def forward(self, q, k, mask=None):
        """Return per-head attention maps (softmax over spatial positions).

        Args:
            q: query tensor of shape [bs, num_queries, query_dim].
            k: key feature map of shape [bs, query_dim, h, w].
            mask: optional additive mask broadcastable to the weights
                (typically -inf at padded positions).
        Returns:
            Tensor of shape [bs, num_queries, num_heads, h, w].
        """
        q = self.q_proj(q)
        k = self.k_proj(k)
        bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\
            self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]
        qh = q.reshape([bs, num_queries, n, c])
        kh = k.reshape([bs, n, c, h, w])
        # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
        # The einsum above is emulated with a batched matmul over bs * n.
        qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c])
        kh = kh.reshape([-1, c, h * w])
        weights = paddle.bmm(qh * self.normalize_fact, kh).reshape(
            [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4])

        if mask is not None:
            weights += mask
        # fix a potenial bug: https://github.com/facebookresearch/detr/issues/247
        # Softmax over the flattened spatial dims (h*w), then restore shape.
        weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape)
        weights = self.dropout(weights)
        return weights
class MaskHeadFPNConv(nn.Layer):
    """
    Simple convolutional head, using group norm.
    Upsampling is done using a FPN approach
    """

    def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):
        # input_dim: channels of the concatenated (memory + attention) input.
        # fpn_dims: channel counts of the FPN levels fed to forward().
        # context_dim: base width; stage widths halve four times from it.
        super().__init__()

        inter_dims = [input_dim,
                      ] + [context_dim // (2**i) for i in range(1, 5)]
        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.KaimingUniform())
        bias_attr = paddle.framework.ParamAttr(
            initializer=paddle.nn.initializer.Constant())

        self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,
                                       weight_attr, bias_attr)
        self.conv_inter = nn.LayerList()
        for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]):
            self.conv_inter.append(
                self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr,
                                  bias_attr))

        # Final 1-channel mask logit prediction.
        self.conv_out = nn.Conv2D(
            inter_dims[-1],
            1,
            3,
            padding=1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)

        # 1x1 adapters align each FPN level's channels with the decoder path.
        self.adapter = nn.LayerList()
        for i in range(len(fpn_dims)):
            self.adapter.append(
                nn.Conv2D(
                    fpn_dims[i],
                    inter_dims[i + 1],
                    1,
                    weight_attr=weight_attr,
                    bias_attr=bias_attr))

    def _make_layers(self,
                     in_dims,
                     out_dims,
                     kernel_size,
                     num_groups,
                     weight_attr=None,
                     bias_attr=None):
        # One Conv2D -> GroupNorm -> ReLU stage.
        return nn.Sequential(
            nn.Conv2D(
                in_dims,
                out_dims,
                kernel_size,
                padding=kernel_size // 2,
                weight_attr=weight_attr,
                bias_attr=bias_attr),
            nn.GroupNorm(num_groups, out_dims),
            nn.ReLU())

    def forward(self, x, bbox_attention_map, fpns):
        """Produce per-query mask logits.

        Args:
            x: projected encoder memory feature map.
            bbox_attention_map: per-query attention maps whose dim-1 size is
                the number of queries (used as the replication factor).
            fpns: FPN feature maps, ordered to match ``self.adapter``.
        """
        # Replicate memory once per query, then concatenate each query's
        # attention maps along channels.
        x = paddle.concat([
            x.tile([bbox_attention_map.shape[1], 1, 1, 1]),
            bbox_attention_map.flatten(0, 1)
        ], 1)
        x = self.conv0(x)
        for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1],
                                                    self.adapter, fpns):
            feat = adapter_layer(feat).tile(
                [bbox_attention_map.shape[1], 1, 1, 1])
            x = inter_layer(x)
            # Upsample to this FPN level's resolution and fuse by addition.
            x = feat + F.interpolate(x, size=feat.shape[-2:])

        x = self.conv_inter[-1](x)
        x = self.conv_out(x)
        return x
@register
class DETRHead(nn.Layer):
    """DETR prediction head: classification + box regression, with an
    optional FPN-based mask head for instance segmentation."""
    __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']
    __inject__ = ['loss']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 nhead=8,
                 num_mlp_layers=3,
                 loss='DETRLoss',
                 fpn_dims=[1024, 512, 256],
                 with_mask_head=False,
                 use_focal_loss=False):
        super(DETRHead, self).__init__()
        # add background class
        self.num_classes = num_classes if use_focal_loss else num_classes + 1
        self.hidden_dim = hidden_dim
        self.loss = loss
        self.with_mask_head = with_mask_head
        self.use_focal_loss = use_focal_loss

        self.score_head = nn.Linear(hidden_dim, self.num_classes)
        # 3-layer MLP regresses normalized (cx, cy, w, h) per query.
        self.bbox_head = MLP(hidden_dim,
                             hidden_dim,
                             output_dim=4,
                             num_layers=num_mlp_layers)
        if self.with_mask_head:
            self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim,
                                                        nhead)
            # Mask head input: projected memory channels + nhead attn maps.
            self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims,
                                             hidden_dim)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.score_head)

    @classmethod
    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
        # fpn_dims: backbone channel counts reversed, dropping the first
        # (the level consumed by the transformer itself).
        return {
            'hidden_dim': hidden_dim,
            'nhead': nhead,
            'fpn_dims': [i.channels for i in input_shape[::-1]][1:]
        }

    @staticmethod
    def get_gt_mask_from_polygons(gt_poly, pad_mask):
        """Rasterize per-image COCO polygons into float32 masks padded to
        the batch's padded spatial size.

        Args:
            gt_poly: list (per image) of per-object polygon lists.
            pad_mask: per-image padding masks; 1 on valid pixels.
        Returns:
            list of [num_objects, pad_h, pad_w] float32 mask tensors.
        """
        out_gt_mask = []
        for polygons, padding in zip(gt_poly, pad_mask):
            # Sums over a row/column of the pad mask recover the unpadded
            # height/width of the original image.
            height, width = int(padding[:, 0].sum()), int(padding[0, :].sum())
            masks = []
            for obj_poly in polygons:
                rles = mask_util.frPyObjects(obj_poly, height, width)
                rle = mask_util.merge(rles)
                masks.append(
                    paddle.to_tensor(mask_util.decode(rle)).astype('float32'))
            masks = paddle.stack(masks)
            masks_pad = paddle.zeros(
                [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]])
            masks_pad[:, :height, :width] = masks
            out_gt_mask.append(masks_pad)
        return out_gt_mask

    def forward(self, out_transformer, body_feats, inputs=None):
        r"""
        Args:
            out_transformer (Tuple): (feats: [num_levels, batch_size,
                                        num_queries, hidden_dim],
                                memory: [batch_size, hidden_dim, h, w],
                                src_proj: [batch_size, h*w, hidden_dim],
                                src_mask: [batch_size, 1, 1, h, w])
            body_feats (List(Tensor)): list[[B, C, H, W]]
            inputs (dict): dict(inputs)
        """
        feats, memory, src_proj, src_mask = out_transformer
        outputs_logit = self.score_head(feats)
        # Box coords are predicted in [0, 1] via sigmoid.
        outputs_bbox = F.sigmoid(self.bbox_head(feats))
        outputs_seg = None
        if self.with_mask_head:
            # Attention between last decoder level's queries and the memory.
            bbox_attention_map = self.bbox_attention(feats[-1], memory,
                                                     src_mask)
            # Reverse body_feats and drop the coarsest level to match
            # the fpn_dims ordering from from_config.
            fpn_feats = [a for a in body_feats[::-1]][1:]
            outputs_seg = self.mask_head(src_proj, bbox_attention_map,
                                         fpn_feats)
            # Reshape flat (batch * queries) masks to [batch, queries, h, w].
            outputs_seg = outputs_seg.reshape([
                feats.shape[1], feats.shape[2], outputs_seg.shape[-2],
                outputs_seg.shape[-1]
            ])

        if self.training:
            assert inputs is not None
            assert 'gt_bbox' in inputs and 'gt_class' in inputs

            gt_mask = self.get_gt_mask_from_polygons(
                inputs['gt_poly'],
                inputs['pad_mask']) if 'gt_poly' in inputs else None
            return self.loss(
                outputs_bbox,
                outputs_logit,
                inputs['gt_bbox'],
                inputs['gt_class'],
                masks=outputs_seg,
                gt_mask=gt_mask)
        else:
            # Inference uses only the last decoder level's predictions.
            return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)
......@@ -28,6 +28,8 @@ __all__ = [
'xavier_normal_',
'kaiming_uniform_',
'kaiming_normal_',
'linear_init_',
'conv_init_',
'reset_initialized_parameter',
]
......@@ -46,7 +48,7 @@ def _no_grad_normal_(tensor, mean=0., std=1.):
return tensor
def _no_grad_fill_(tensor, value=0):
def _no_grad_fill_(tensor, value=0.):
with paddle.no_grad():
v = paddle.rand(shape=tensor.shape, dtype=tensor.dtype)
v[...] = value
......@@ -80,7 +82,7 @@ def normal_(tensor, mean=0., std=1.):
return _no_grad_normal_(tensor, mean, std)
def constant_(tensor, value=0):
def constant_(tensor, value=0.):
"""
Modified tensor inspace using constant_
Args:
......@@ -150,7 +152,7 @@ def xavier_uniform_(tensor, gain=1., reverse=False):
Modified tensor inspace using xavier_uniform_
Args:
tensor (paddle.Tensor): paddle Tensor
gain (str): super parameter, 1. default.
gain (float): super parameter, 1. default.
reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
Return:
tensor
......@@ -166,7 +168,7 @@ def xavier_normal_(tensor, gain=1., reverse=False):
Modified tensor inspace using xavier_normal_
Args:
tensor (paddle.Tensor): paddle Tensor
gain (str): super parameter, 1. default.
gain (float): super parameter, 1. default.
reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
Return:
tensor
......@@ -260,6 +262,18 @@ def kaiming_normal_(tensor,
return _no_grad_normal_(tensor, 0, std)
def linear_init_(module):
    """Initialize a Linear layer's weight and bias uniformly in
    [-1/sqrt(fan_in), 1/sqrt(fan_in)], where fan_in is weight.shape[0]."""
    fan_in = module.weight.shape[0]
    bound = 1 / math.sqrt(fan_in)
    for param in (module.weight, module.bias):
        uniform_(param, -bound, bound)
def conv_init_(module):
    """Initialize a Conv layer's weight and bias uniformly in
    [-1/sqrt(fan_in), 1/sqrt(fan_in)], with fan_in the product of all
    weight dims except the output-channel dim."""
    fan_in = math.prod(module.weight.shape[1:])
    bound = 1 / math.sqrt(fan_in)
    for param in (module.weight, module.bias):
        uniform_(param, -bound, bound)
@paddle.no_grad()
def reset_initialized_parameter(model, include_self=True):
"""
......
......@@ -29,8 +29,11 @@ from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from ppdet.modeling.bbox_utils import delta2bbox
from . import ops
from .initializer import xavier_uniform_, constant_
from paddle.vision.ops import DeformConv2D
from paddle.nn.layer import transformer
_convert_attention_mask = transformer._convert_attention_mask
def _to_list(l):
......@@ -1187,3 +1190,179 @@ class Concat(nn.Layer):
def extra_repr(self):
return 'dim={}'.format(self.dim)
class MultiHeadAttention(nn.Layer):
    """
    Attention mapps queries and a set of key-value pairs to outputs, and
    Multi-Head Attention performs multiple parallel attention to jointly attending
    to information from different representation subspaces.

    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
    for more details.

    Parameters:
        embed_dim (int): The expected feature size in the input and output.
        num_heads (int): The number of heads in multi-head attention.
        dropout (float, optional): The dropout probability used on attention
            weights to drop some attention targets. 0 for no dropout. Default 0
        kdim (int, optional): The feature size in key. If None, assumed equal to
            `embed_dim`. Default None.
        vdim (int, optional): The feature size in value. If None, assumed equal to
            `embed_dim`. Default None.
        need_weights (bool, optional): Indicate whether to return the attention
            weights. Default False.

    Examples:

        .. code-block:: python

            import paddle

            # encoder input: [batch_size, sequence_length, d_model]
            query = paddle.rand((2, 4, 128))
            # self attention mask: [batch_size, num_heads, query_len, query_len]
            attn_mask = paddle.rand((2, 2, 4, 4))
            multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)
            output = multi_head_attn(query, None, None, attn_mask=attn_mask)  # [2, 4, 128]
    """

    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 kdim=None,
                 vdim=None,
                 need_weights=False):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        # When q/k/v dims all match, a single fused in-projection is used.
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.need_weights = need_weights

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        if self._qkv_same_embed_dim:
            # Fused q/k/v projection: columns [0:E), [E:2E), [2E:3E).
            self.in_proj_weight = self.create_parameter(
                shape=[embed_dim, 3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=False)
            self.in_proj_bias = self.create_parameter(
                shape=[3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=True)
        else:
            self.q_proj = nn.Linear(embed_dim, embed_dim)
            self.k_proj = nn.Linear(self.kdim, embed_dim)
            self.v_proj = nn.Linear(self.vdim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self._type_list = ('q_proj', 'k_proj', 'v_proj')

        self._reset_parameters()

    def _reset_parameters(self):
        # Xavier-uniform for matrices, constant (zero) for vectors/biases.
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
            else:
                constant_(p)

    def compute_qkv(self, tensor, index):
        # index selects q (0), k (1) or v (2).
        if self._qkv_same_embed_dim:
            # Slice the fused projection's weight/bias for this component.
            tensor = F.linear(
                x=tensor,
                weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1)
                                           * self.embed_dim],
                bias=self.in_proj_bias[index * self.embed_dim:(index + 1) *
                                       self.embed_dim]
                if self.in_proj_bias is not None else None)
        else:
            tensor = getattr(self, self._type_list[index])(tensor)
        # Split heads: [b, seq, E] -> [b, num_heads, seq, head_dim].
        tensor = tensor.reshape(
            [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
        return tensor

    def forward(self, query, key=None, value=None, attn_mask=None):
        r"""
        Applies multi-head attention to map queries and a set of key-value pairs
        to outputs.

        Parameters:
            query (Tensor): The queries for multi-head attention. It is a
                tensor with shape `[batch_size, query_length, embed_dim]`. The
                data type should be float32 or float64.
            key (Tensor, optional): The keys for multi-head attention. It is
                a tensor with shape `[batch_size, key_length, kdim]`. The
                data type should be float32 or float64. If None, use `query` as
                `key`. Default None.
            value (Tensor, optional): The values for multi-head attention. It
                is a tensor with shape `[batch_size, value_length, vdim]`.
                The data type should be float32 or float64. If None, use `query` as
                `value`. Default None.
            attn_mask (Tensor, optional): A tensor used in multi-head attention
                to prevents attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
                When the data type is bool, the unwanted positions have `False`
                values and the others have `True` values. When the data type is
                int, the unwanted positions have 0 values and the others have 1
                values. When the data type is float, the unwanted positions have
                `-INF` values and the others have 0 values. It can be None when
                nothing wanted or needed to be prevented attention to. Default None.

        Returns:
            Tensor|tuple: It is a tensor that has the same shape and data type \
                as `query`, representing attention output. Or a tuple if \
                `need_weights` is True or `cache` is not None. If `need_weights` \
                is True, except for attention output, the tuple also includes \
                the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \
                If `cache` is not None, the tuple then includes the new cache \
                having the same type as `cache`, and if it is `StaticCache`, it \
                is same as the input `cache`, if it is `Cache`, the new cache \
                reserves tensors concatanating raw tensors with intermediate \
                results of current query.
        """
        # Self-attention when key/value are omitted.
        key = query if key is None else key
        value = query if value is None else value
        # compute q ,k ,v
        q, k, v = (self.compute_qkv(t, i)
                   for i, t in enumerate([query, key, value]))

        # scale dot product attention
        product = paddle.matmul(x=q, y=k, transpose_y=True)
        scaling = float(self.head_dim)**-0.5
        product = product * scaling

        if attn_mask is not None:
            # Support bool or int mask
            attn_mask = _convert_attention_mask(attn_mask, product.dtype)
            product = product + attn_mask

        weights = F.softmax(product)
        if self.dropout:
            weights = F.dropout(
                weights,
                self.dropout,
                training=self.training,
                mode="upscale_in_train")
        out = paddle.matmul(weights, v)

        # combine heads
        out = paddle.transpose(out, perm=[0, 2, 1, 3])
        out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)

        outs = [out]
        if self.need_weights:
            outs.append(weights)
        return out if len(outs) == 1 else tuple(outs)
......@@ -22,6 +22,7 @@ from . import ctfocal_loss
from . import keypoint_loss
from . import jde_loss
from . import fairmot_loss
from . import detr_loss
from .yolo_loss import *
from .iou_aware_loss import *
......@@ -33,3 +34,4 @@ from .ctfocal_loss import *
from .keypoint_loss import *
from .jde_loss import *
from .fairmot_loss import *
from .detr_loss import *
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from .iou_loss import GIoULoss
from ..transformers import bbox_cxcywh_to_xyxy, bbox_overlaps, sigmoid_focal_loss
__all__ = ['DETRLoss']
@register
class DETRLoss(nn.Layer):
__shared__ = ['num_classes', 'use_focal_loss']
__inject__ = ['matcher']
def __init__(self,
             num_classes=80,
             matcher='HungarianMatcher',
             loss_coeff={
                 'class': 1,
                 'bbox': 5,
                 'giou': 2,
                 'no_object': 0.1,
                 'mask': 1,
                 'dice': 1
             },
             aux_loss=True,
             use_focal_loss=False):
    r"""
    Args:
        num_classes (int): The number of classes.
        matcher (HungarianMatcher): It computes an assignment between the targets
            and the predictions of the network.
        loss_coeff (dict): The coefficient of loss.
        aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.
        use_focal_loss (bool): Use focal loss or not.
    """
    super(DETRLoss, self).__init__()
    self.num_classes = num_classes
    self.matcher = matcher
    # BUGFIX: copy the dict so the (shared, mutable) default argument is
    # never mutated in place — the code below rebinds loss_coeff['class']
    # to a tensor, which would corrupt the default for every later
    # default-constructed instance.
    self.loss_coeff = dict(loss_coeff)
    self.aux_loss = aux_loss
    self.use_focal_loss = use_focal_loss

    if not self.use_focal_loss:
        # Per-class cross-entropy weights; the appended background class
        # gets the (smaller) 'no_object' weight.
        self.loss_coeff['class'] = paddle.full([num_classes + 1],
                                               loss_coeff['class'])
        self.loss_coeff['class'][-1] = loss_coeff['no_object']
    self.giou_loss = GIoULoss()
def _get_loss_class(self, logits, gt_class, match_indices, bg_index,
num_gts):
# logits: [b, query, num_classes], gt_class: list[[n, 1]]
target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
bs, num_query_objects = target_label.shape
if sum(len(a) for a in gt_class) > 0:
index, updates = self._get_index_updates(num_query_objects,
gt_class, match_indices)
target_label = paddle.scatter(
target_label.reshape([-1, 1]), index, updates.astype('int64'))
target_label = target_label.reshape([bs, num_query_objects])
if self.use_focal_loss:
target_label = F.one_hot(target_label,
self.num_classes + 1)[:, :, :-1]
return {
'loss_class': self.loss_coeff['class'] * sigmoid_focal_loss(
logits, target_label, num_gts / num_query_objects)
if self.use_focal_loss else F.cross_entropy(
logits, target_label, weight=self.loss_coeff['class'])
}
def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts):
# boxes: [b, query, 4], gt_bbox: list[[n, 4]]
loss = dict()
if sum(len(a) for a in gt_bbox) == 0:
loss['loss_bbox'] = paddle.to_tensor([0.])
loss['loss_giou'] = paddle.to_tensor([0.])
return loss
src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
match_indices)
loss['loss_bbox'] = self.loss_coeff['bbox'] * F.l1_loss(
src_bbox, target_bbox, reduction='sum') / num_gts
loss['loss_giou'] = self.giou_loss(
bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
loss['loss_giou'] = loss['loss_giou'].sum() / num_gts
loss['loss_giou'] = self.loss_coeff['giou'] * loss['loss_giou']
return loss
def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts):
# masks: [b, query, h, w], gt_mask: list[[n, H, W]]
loss = dict()
if sum(len(a) for a in gt_mask) == 0:
loss['loss_mask'] = paddle.to_tensor([0.])
loss['loss_dice'] = paddle.to_tensor([0.])
return loss
src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
match_indices)
src_masks = F.interpolate(
src_masks.unsqueeze(0),
size=target_masks.shape[-2:],
mode="bilinear")[0]
loss['loss_mask'] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
src_masks,
target_masks,
paddle.to_tensor(
[num_gts], dtype='float32'))
loss['loss_dice'] = self.loss_coeff['dice'] * self._dice_loss(
src_masks, target_masks, num_gts)
return loss
def _dice_loss(self, inputs, targets, num_gts):
inputs = F.sigmoid(inputs)
inputs = inputs.flatten(1)
targets = targets.flatten(1)
numerator = 2 * (inputs * targets).sum(1)
denominator = inputs.sum(-1) + targets.sum(-1)
loss = 1 - (numerator + 1) / (denominator + 1)
return loss.sum() / num_gts
def _get_loss_aux(self, boxes, logits, gt_bbox, gt_class, bg_index,
num_gts):
loss_class = []
loss_bbox = []
loss_giou = []
for aux_boxes, aux_logits in zip(boxes, logits):
match_indices = self.matcher(aux_boxes, aux_logits, gt_bbox,
gt_class)
loss_class.append(
self._get_loss_class(aux_logits, gt_class, match_indices,
bg_index, num_gts)['loss_class'])
loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
num_gts)
loss_bbox.append(loss_['loss_bbox'])
loss_giou.append(loss_['loss_giou'])
loss = {
'loss_class_aux': paddle.add_n(loss_class),
'loss_bbox_aux': paddle.add_n(loss_bbox),
'loss_giou_aux': paddle.add_n(loss_giou)
}
return loss
def _get_index_updates(self, num_query_objects, target, match_indices):
batch_idx = paddle.concat([
paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)
])
src_idx = paddle.concat([src for (src, _) in match_indices])
src_idx += (batch_idx * num_query_objects)
target_assign = paddle.concat([
paddle.gather(
t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
])
return src_idx, target_assign
def _get_src_target_assign(self, src, target, match_indices):
src_assign = paddle.concat([
paddle.gather(
t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])
for t, (I, _) in zip(src, match_indices)
])
target_assign = paddle.concat([
paddle.gather(
t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
for t, (_, J) in zip(target, match_indices)
])
return src_assign, target_assign
def forward(self,
boxes,
logits,
gt_bbox,
gt_class,
masks=None,
gt_mask=None):
r"""
Args:
boxes (Tensor): [l, b, query, 4]
logits (Tensor): [l, b, query, num_classes]
gt_bbox (List(Tensor)): list[[n, 4]]
gt_class (List(Tensor)): list[[n, 1]]
masks (Tensor, optional): [b, query, h, w]
gt_mask (List(Tensor), optional): list[[n, H, W]]
"""
match_indices = self.matcher(boxes[-1].detach(), logits[-1].detach(),
gt_bbox, gt_class)
num_gts = sum(len(a) for a in gt_bbox)
try:
# TODO: Paddle does not have a "paddle.distributed.is_initialized()"
num_gts = paddle.to_tensor([num_gts], dtype=paddle.float32)
paddle.distributed.all_reduce(num_gts)
num_gts = paddle.clip(
num_gts / paddle.distributed.get_world_size(), min=1).item()
except:
num_gts = max(num_gts, 1)
total_loss = dict()
total_loss.update(
self._get_loss_class(logits[-1], gt_class, match_indices,
self.num_classes, num_gts))
total_loss.update(
self._get_loss_bbox(boxes[-1], gt_bbox, match_indices, num_gts))
if masks is not None and gt_mask is not None:
total_loss.update(
self._get_loss_mask(masks, gt_mask, match_indices, num_gts))
if self.aux_loss:
total_loss.update(
self._get_loss_aux(boxes[:-1], logits[:-1], gt_bbox, gt_class,
self.num_classes, num_gts))
return total_loss
......@@ -19,18 +19,16 @@ import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling.bbox_utils import nonempty_bbox, rbox2poly, rbox2poly
from ppdet.modeling.layers import TTFBox
from .transformers import bbox_cxcywh_to_xyxy
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
__all__ = [
'BBoxPostProcess',
'MaskPostProcess',
'FCOSPostProcess',
'S2ANetBBoxPostProcess',
'JDEBBoxPostProcess',
'CenterNetPostProcess',
'BBoxPostProcess', 'MaskPostProcess', 'FCOSPostProcess',
'S2ANetBBoxPostProcess', 'JDEBBoxPostProcess', 'CenterNetPostProcess',
'DETRBBoxPostProcess'
]
......@@ -492,3 +490,64 @@ class CenterNetPostProcess(TTFBox):
else:
results = paddle.concat([clses, scores, bboxes], axis=1)
return results, paddle.shape(results)[0:1]
@register
class DETRBBoxPostProcess(object):
    """Decode DETR head outputs into per-image detections.

    Converts cxcywh boxes to xyxy in original-image coordinates, scores the
    queries (sigmoid for focal loss, softmax without the background column
    otherwise), and keeps the top ``num_top_queries`` per image.
    """
    __shared__ = ['num_classes', 'use_focal_loss']
    __inject__ = []

    def __init__(self,
                 num_classes=80,
                 num_top_queries=100,
                 use_focal_loss=False):
        super(DETRBBoxPostProcess, self).__init__()
        self.num_classes = num_classes
        self.num_top_queries = num_top_queries
        self.use_focal_loss = use_focal_loss

    def __call__(self, head_out, im_shape, scale_factor):
        """
        Decode the bbox.
        Args:
            head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output.
            im_shape (Tensor): The shape of the input image.
            scale_factor (Tensor): The scale factor of the input image.
        Returns:
            bbox_pred (Tensor): The output prediction with shape [N, 6], including
                labels, scores and bboxes. The size of bboxes are corresponding
                to the input image, the bboxes may be used in other branch.
            bbox_num (Tensor): The number of prediction boxes of each batch with
                shape [bs], and is N.
        """
        bboxes, logits, masks = head_out  # masks unused here
        bbox_pred = bbox_cxcywh_to_xyxy(bboxes)
        # Recover the pre-resize image size; +0.5 rounds to nearest integer.
        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
        img_h, img_w = origin_shape.unbind(1)
        # Per-image [w, h, w, h] scale for the normalized xyxy boxes.
        origin_shape = paddle.stack(
            [img_w, img_h, img_w, img_h], axis=-1).unsqueeze(0)
        bbox_pred *= origin_shape
        # Softmax path drops the last (background) column before scoring.
        scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax(
            logits)[:, :, :-1]
        scores, labels = scores.max(-1), scores.argmax(-1)
        if scores.shape[1] > self.num_top_queries:
            # Keep only the highest-scoring queries, gathering per image.
            scores, index = paddle.topk(scores, self.num_top_queries, axis=-1)
            labels = paddle.stack(
                [paddle.gather(l, i) for l, i in zip(labels, index)])
            bbox_pred = paddle.stack(
                [paddle.gather(b, i) for b, i in zip(bbox_pred, index)])
        # Pack as [label, score, x0, y0, x1, y1] rows, flattened over batch.
        bbox_pred = paddle.concat(
            [
                labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1),
                bbox_pred
            ],
            axis=-1)
        bbox_num = paddle.to_tensor(
            bbox_pred.shape[1], dtype='int32').tile([bbox_pred.shape[0]])
        bbox_pred = bbox_pred.reshape([-1, 6])
        return bbox_pred, bbox_num
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import detr_transformer
from . import utils
from . import matchers
from . import position_encoding
from .detr_transformer import *
from .utils import *
from .matchers import *
from .position_encoding import *
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
from paddle.nn.layer.transformer import _convert_attention_mask
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..layers import MultiHeadAttention
from .position_encoding import PositionEmbedding
from .utils import *
from ..initializer import *
__all__ = ['DETRTransformer']
class TransformerEncoderLayer(nn.Layer):
    """One DETR encoder layer: self-attention then a two-layer FFN, each
    wrapped with residual + LayerNorm (pre- or post-norm depending on
    ``normalize_before``)."""

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerEncoderLayer, self).__init__()
        # Attention / activation dropout default to the generic dropout rate.
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before
        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        # Re-initialize the FFN projections (helper from ..initializer).
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        # Positional embedding is added to queries/keys only, never values.
        return tensor if pos_embed is None else tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        """Run one encoder layer.

        Args:
            src: [b, HW, d_model] flattened feature tokens.
            src_mask: attention mask (converted to additive float form).
            pos_embed: positional embedding added to q/k.
        """
        src_mask = _convert_attention_mask(src_mask, src.dtype)
        # --- self-attention sublayer ---
        residual = src
        if self.normalize_before:
            src = self.norm1(src)
        q = k = self.with_pos_embed(src, pos_embed)
        src = self.self_attn(q, k, value=src, attn_mask=src_mask)
        src = residual + self.dropout1(src)
        if not self.normalize_before:
            src = self.norm1(src)
        # --- feed-forward sublayer ---
        residual = src
        if self.normalize_before:
            src = self.norm2(src)
        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = residual + self.dropout2(src)
        if not self.normalize_before:
            src = self.norm2(src)
        return src
class TransformerEncoder(nn.Layer):
    """A stack of identical encoder layers with an optional final norm."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, src_mask=None, pos_embed=None):
        """Feed `src` through every layer in order, sharing mask/pos-embed."""
        src_mask = _convert_attention_mask(src_mask, src.dtype)
        out = src
        for encoder_layer in self.layers:
            out = encoder_layer(out, src_mask=src_mask, pos_embed=pos_embed)
        return out if self.norm is None else self.norm(out)
class TransformerDecoderLayer(nn.Layer):
    """One DETR decoder layer: query self-attention, cross-attention over the
    encoder memory, then a two-layer FFN — each with residual + LayerNorm
    (pre- or post-norm depending on ``normalize_before``)."""

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerDecoderLayer, self).__init__()
        # Attention / activation dropout default to the generic dropout rate.
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before
        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        # Re-initialize the FFN projections (helper from ..initializer).
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        # Positional embedding is added to queries/keys only, never values.
        return tensor if pos_embed is None else tensor + pos_embed

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        """Run one decoder layer.

        Args:
            tgt: [b, num_queries, d_model] object queries.
            memory: [b, HW, d_model] encoder output.
            tgt_mask / memory_mask: attention masks (converted to additive).
            pos_embed: spatial positional embedding for the memory keys.
            query_pos_embed: learned positional embedding for the queries.
        """
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)
        # --- self-attention over queries ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt, query_pos_embed)
        tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask)
        tgt = residual + self.dropout1(tgt)
        if not self.normalize_before:
            tgt = self.norm1(tgt)
        # --- cross-attention: queries attend to encoder memory ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)
        q = self.with_pos_embed(tgt, query_pos_embed)
        k = self.with_pos_embed(memory, pos_embed)
        tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask)
        tgt = residual + self.dropout2(tgt)
        if not self.normalize_before:
            tgt = self.norm2(tgt)
        # --- feed-forward sublayer ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm3(tgt)
        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = residual + self.dropout3(tgt)
        if not self.normalize_before:
            tgt = self.norm3(tgt)
        return tgt
class TransformerDecoder(nn.Layer):
    """A stack of decoder layers; optionally returns the normalized output of
    every layer (for DETR auxiliary losses)."""

    def __init__(self,
                 decoder_layer,
                 num_layers,
                 norm=None,
                 return_intermediate=False):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        """Decode `tgt` against `memory` through all layers.

        Returns a stack [num_layers, b, query, d] when intermediates are
        requested, otherwise the last-layer output with a leading level axis.
        """
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)
        per_layer = []
        out = tgt
        for decoder_layer in self.layers:
            out = decoder_layer(
                out,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                pos_embed=pos_embed,
                query_pos_embed=query_pos_embed)
            if self.return_intermediate:
                # Each intermediate is normalized before being recorded.
                per_layer.append(self.norm(out))
        if self.return_intermediate:
            return paddle.stack(per_layer)
        if self.norm is not None:
            out = self.norm(out)
        return out.unsqueeze(0)
@register
class DETRTransformer(nn.Layer):
    """The full DETR transformer: projects the last backbone feature map to
    ``hidden_dim``, runs the encoder over flattened tokens with sine/learned
    positional embeddings, then decodes a fixed set of learned object queries
    against the encoder memory."""
    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=100,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 backbone_num_channels=2048,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(DETRTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'],\
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        self.hidden_dim = hidden_dim
        self.nhead = nhead
        encoder_layer = TransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        # Pre-norm encoders need one final LayerNorm; post-norm ones do not.
        encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)
        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        decoder_norm = nn.LayerNorm(hidden_dim)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec)
        # 1x1 conv maps backbone channels (e.g. 2048) down to hidden_dim.
        self.input_proj = nn.Conv2D(
            backbone_num_channels, hidden_dim, kernel_size=1)
        # Learned positional embedding, one row per object query.
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
        # Half the channels encode y, the other half x, hence hidden_dim // 2.
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            normalize=True if position_embed_type == 'sine' else False,
            embed_type=position_embed_type)
        self._reset_parameters()

    def _reset_parameters(self):
        # Xavier init for all matrices, then specialized init for the
        # projection conv and the query embedding (helpers from ..initializer).
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
        conv_init_(self.input_proj)
        normal_(self.query_pos_embed.weight)

    @classmethod
    def from_config(cls, cfg, input_shape):
        # Infer the projection's input channels from the last backbone level.
        return {
            'backbone_num_channels': [i.channels for i in input_shape][-1],
        }

    def forward(self, src, src_mask=None):
        r"""
        Applies a Transformer model on the inputs.
        Parameters:
            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]].
            src_mask (Tensor, optional): A tensor used in multi-head attention
                to prevents attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                [bs, H, W]`. When the data type is bool, the unwanted positions
                have `False` values and the others have `True` values. When the
                data type is int, the unwanted positions have 0 values and the
                others have 1 values. When the data type is float, the unwanted
                positions have `-INF` values and the others have 0 values. It
                can be None when nothing wanted or needed to be prevented
                attention to. Default None.
        Returns:
            output (Tensor): [num_levels, batch_size, num_queries, hidden_dim]
            memory (Tensor): [batch_size, hidden_dim, h, w]
        """
        # use last level feature map
        src_proj = self.input_proj(src[-1])
        bs, c, h, w = src_proj.shape
        # flatten [B, C, H, W] to [B, HxW, C]
        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])
        if src_mask is not None:
            # Downsample the padding mask to the feature-map resolution.
            src_mask = F.interpolate(
                src_mask.unsqueeze(0).astype(src_flatten.dtype),
                size=(h, w))[0].astype('bool')
        else:
            # No padding info: treat every position as valid.
            src_mask = paddle.ones([bs, h, w], dtype='bool')
        pos_embed = self.position_embedding(src_mask).flatten(2).transpose(
            [0, 2, 1])
        # Convert bool mask to additive float form, broadcastable over
        # [bs, nhead, query, key].
        src_mask = _convert_attention_mask(src_mask, src_flatten.dtype)
        src_mask = src_mask.reshape([bs, 1, 1, -1])
        memory = self.encoder(
            src_flatten, src_mask=src_mask, pos_embed=pos_embed)
        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
            [bs, 1, 1])
        # Decoder input starts at zero; position comes from query_pos_embed.
        tgt = paddle.zeros_like(query_pos_embed)
        output = self.decoder(
            tgt,
            memory,
            memory_mask=src_mask,
            pos_embed=pos_embed,
            query_pos_embed=query_pos_embed)
        return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]),
                src_proj, src_mask.reshape([bs, 1, 1, h, w]))
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from scipy.optimize import linear_sum_assignment
from ppdet.core.workspace import register, serializable
from ..losses.iou_loss import GIoULoss
from .utils import bbox_cxcywh_to_xyxy
__all__ = ['HungarianMatcher']
@register
@serializable
class HungarianMatcher(nn.Layer):
    """Bipartite matcher between predicted queries and ground-truth boxes,
    minimizing a weighted sum of classification, L1-box and GIoU costs with
    the Hungarian algorithm (scipy.optimize.linear_sum_assignment)."""
    __shared__ = ['use_focal_loss']

    def __init__(self,
                 matcher_coeff={'class': 1,
                                'bbox': 5,
                                'giou': 2},
                 use_focal_loss=False,
                 alpha=0.25,
                 gamma=2.0):
        r"""
        Args:
            matcher_coeff (dict): The coefficient of hungarian matcher cost.
        """
        # NOTE(review): `matcher_coeff` is a mutable default argument; it is
        # only read here (never mutated), so instances can safely share it.
        super(HungarianMatcher, self).__init__()
        self.matcher_coeff = matcher_coeff
        self.use_focal_loss = use_focal_loss
        # alpha/gamma are the focal-loss parameters for the class cost.
        self.alpha = alpha
        self.gamma = gamma
        self.giou_loss = GIoULoss()

    def forward(self, boxes, logits, gt_bbox, gt_class):
        r"""
        Args:
            boxes (Tensor): [b, query, 4]
            logits (Tensor): [b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = boxes.shape[:2]
        num_gts = sum(len(a) for a in gt_class)
        if num_gts == 0:
            # No ground truth anywhere in the batch: empty match per image.
            return [(paddle.to_tensor(
                [], dtype=paddle.int64), paddle.to_tensor(
                    [], dtype=paddle.int64)) for _ in range(bs)]
        # We flatten to compute the cost matrices in a batch
        # [batch_size * num_queries, num_classes]
        out_prob = F.sigmoid(logits.flatten(
            0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1))
        # [batch_size * num_queries, 4]
        out_bbox = boxes.flatten(0, 1)
        # Also concat the target labels and boxes
        tgt_ids = paddle.concat(gt_class).flatten()
        tgt_bbox = paddle.concat(gt_bbox)
        # Compute the classification cost
        if self.use_focal_loss:
            # Focal-style cost: penalty for not predicting the GT class minus
            # the penalty that would be incurred by a negative prediction.
            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
                1 - out_prob + 1e-8).log())
            pos_cost_class = self.alpha * (
                (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
            cost_class = paddle.gather(
                pos_cost_class, tgt_ids, axis=1) - paddle.gather(
                    neg_cost_class, tgt_ids, axis=1)
        else:
            # Negative probability of the GT class (approximates 1 - p).
            cost_class = -paddle.gather(out_prob, tgt_ids, axis=1)
        # Compute the L1 cost between boxes
        cost_bbox = (
            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)
        # Compute the giou cost betwen boxes
        cost_giou = self.giou_loss(
            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)
        # Final cost matrix
        C = self.matcher_coeff['class'] * cost_class + self.matcher_coeff['bbox'] * cost_bbox + \
            self.matcher_coeff['giou'] * cost_giou
        C = C.reshape([bs, num_queries, -1])
        C = [a.squeeze(0) for a in C.chunk(bs)]
        # Solve one assignment per image on that image's slice of the
        # GT-column dimension (sizes = number of GTs per image).
        sizes = [a.shape[0] for a in gt_bbox]
        indices = [
            linear_sum_assignment(c.split(sizes, -1)[i].numpy())
            for i, c in enumerate(C)
        ]
        return [(paddle.to_tensor(
            i, dtype=paddle.int64), paddle.to_tensor(
                j, dtype=paddle.int64)) for i, j in indices]
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.nn as nn
from ppdet.core.workspace import register, serializable
@register
@serializable
class PositionEmbedding(nn.Layer):
    """2-D positional embedding for DETR: either fixed sine/cosine features
    or learned per-row / per-column embeddings."""

    def __init__(self,
                 num_pos_feats=128,
                 temperature=10000,
                 normalize=True,
                 scale=None,
                 embed_type='sine',
                 num_embeddings=50):
        super(PositionEmbedding, self).__init__()
        assert embed_type in ['sine', 'learned']

        self.embed_type = embed_type
        if self.embed_type == 'sine':
            self.num_pos_feats = num_pos_feats
            self.temperature = temperature
            self.normalize = normalize
            if scale is not None and normalize is False:
                raise ValueError("normalize should be True if scale is passed")
            if scale is None:
                scale = 2 * math.pi
            self.scale = scale
        elif self.embed_type == 'learned':
            # One learnable embedding per row index and per column index;
            # num_embeddings bounds the supported feature-map height/width.
            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
        else:
            raise ValueError(f"not supported {self.embed_type}")

    def forward(self, mask):
        """
        Args:
            mask (Tensor): [B, H, W], True at valid (non-padding) positions.
        Returns:
            pos (Tensor): [B, C, H, W] with C = 2 * num_pos_feats.
        """
        assert mask.dtype == paddle.bool
        if self.embed_type == 'sine':
            mask = mask.astype('float32')
            # Cumulative sums give each valid position its 1-based y/x rank.
            y_embed = mask.cumsum(1, dtype='float32')
            x_embed = mask.cumsum(2, dtype='float32')
            if self.normalize:
                eps = 1e-6
                # Normalize ranks to (0, scale] using the last valid rank.
                y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
                x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
            # Frequencies paired so sin/cos at index 2i share one wavelength.
            dim_t = 2 * (paddle.arange(self.num_pos_feats) //
                         2).astype('float32')
            dim_t = self.temperature**(dim_t / self.num_pos_feats)
            pos_x = x_embed.unsqueeze(-1) / dim_t
            pos_y = y_embed.unsqueeze(-1) / dim_t
            # Interleave sin/cos along the feature axis.
            pos_x = paddle.stack(
                (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
                axis=4).flatten(3)
            pos_y = paddle.stack(
                (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
                axis=4).flatten(3)
            pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2])
            return pos
        elif self.embed_type == 'learned':
            h, w = mask.shape[-2:]
            i = paddle.arange(w)
            j = paddle.arange(h)
            x_emb = self.col_embed(i)
            y_emb = self.row_embed(j)
            # Broadcast column embeddings down rows and row embeddings across
            # columns, then concat to a [C, H, W] grid tiled over the batch.
            # Fixed: the original used torch-style `.repeat(h, 1, 1)` (Paddle
            # tensors have no such method) and `.tile(a, b, c, d)` with
            # positional args; paddle.Tensor.tile takes a repeat_times list.
            pos = paddle.concat(
                [
                    x_emb.unsqueeze(0).tile([h, 1, 1]),
                    y_emb.unsqueeze(1).tile([1, w, 1]),
                ],
                axis=-1).transpose([2, 0, 1]).unsqueeze(0).tile(
                    [mask.shape[0], 1, 1, 1])
            return pos
        else:
            raise ValueError(f"not supported {self.embed_type}")
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ..bbox_utils import bbox_overlaps
__all__ = [
'_get_clones', 'bbox_overlaps', 'bbox_cxcywh_to_xyxy',
'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss'
]
def _get_clones(module, N):
    """Return a LayerList holding N independent deep copies of `module`."""
    replicas = []
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return nn.LayerList(replicas)
def bbox_cxcywh_to_xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x0, y0, x1, y1) on the last axis."""
    cx, cy, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    corners = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
    return paddle.stack(corners, axis=-1)
def bbox_xyxy_to_cxcywh(x):
    """Convert boxes from (x0, y0, x1, y1) to (cx, cy, w, h) on the last axis."""
    x0, y0, x1, y1 = x.unbind(-1)
    components = [(x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0]
    return paddle.stack(components, axis=-1)
def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
    """Sigmoid focal loss: mean over axis 1, summed over the batch, and
    divided by `normalizer` when it exceeds 1."""
    pred = F.sigmoid(logit)
    bce = F.binary_cross_entropy_with_logits(logit, label, reduction="none")
    # Probability the model assigns to the true class of each element.
    pt = pred * label + (1 - pred) * (1 - label)
    loss = bce * ((1 - pt)**gamma)
    if alpha >= 0:
        # Class-balance weighting between positives and negatives.
        alpha_weight = alpha * label + (1 - alpha) * (1 - label)
        loss = alpha_weight * loss
    reduced = loss.mean(1).sum()
    if normalizer > 1.:
        reduced = reduced / normalizer
    return reduced
......@@ -244,10 +244,11 @@ class OptimizerBuilder():
optim_args = self.optimizer.copy()
optim_type = optim_args['type']
del optim_args['type']
if optim_type != 'AdamW':
optim_args['weight_decay'] = regularization
op = getattr(optimizer, optim_type)
return op(learning_rate=learning_rate,
parameters=params,
weight_decay=regularization,
grad_clip=grad_clip,
**optim_args)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册