diff --git a/ppdet/data/transform/operators.py b/ppdet/data/transform/operators.py
index 3d95dd1b88f856ed36471dcc313f3975670f8b4e..b729cb77a4f8d0d4c82bccdf27e65b539595e2d7 100644
--- a/ppdet/data/transform/operators.py
+++ b/ppdet/data/transform/operators.py
@@ -40,6 +40,7 @@ from PIL import Image, ImageEnhance, ImageDraw
 from ppdet.core.workspace import serializable
 from ppdet.modeling.layers import AnchorGrid
 from ppdet.modeling import bbox_utils
+from ..reader import Compose
 from .op_helper import (satisfy_sample_constraint, filter_and_process,
                         generate_sample_bbox, clip_bbox,
                         data_anchor_sampling,
@@ -2348,7 +2349,7 @@ class RandomResizeCrop(BaseOperator):
                 for gt_segm in sample['gt_segm']
             ]
             sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
-            
+
         return sample
@@ -2528,3 +2529,354 @@ class Mosaic(BaseOperator):
             sample['difficult'] = difficult
 
         return sample
+
+
+@register_op
+class RandomSelect(BaseOperator):
+    """
+    Randomly choose between transforms1 and transforms2;
+    transforms1 is chosen with probability p.
+    """
+
+    def __init__(self, transforms1, transforms2, p=0.5):
+        super(RandomSelect, self).__init__()
+        self.transforms1 = Compose(transforms1)
+        self.transforms2 = Compose(transforms2)
+        self.p = p
+
+    def apply(self, sample, context=None):
+        if random.random() < self.p:
+            return self.transforms1(sample)
+        return self.transforms2(sample)
+
+
+@register_op
+class RandomShortSideResize(BaseOperator):
+    def __init__(self,
+                 short_side_sizes,
+                 max_size=None,
+                 interp=cv2.INTER_LINEAR,
+                 random_interp=False):
+        """
+        Resize the image randomly according to the short side. If max_size is
+        not None, the long side is capped at max_size. The whole process keeps
+        the aspect ratio.
+        Args:
+            short_side_sizes (list|tuple): Image target short side size.
+            max_size (int): The size of the longest side of image after resize.
+            interp (int): The interpolation method.
+            random_interp (bool): Whether to randomly select the interpolation
+                method.
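+                When True, the method is sampled from nearest, linear, area,
+                cubic and lanczos4 (see ``self.interps`` below).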
+ """ + super(RandomShortSideResize, self).__init__() + + assert isinstance(short_side_sizes, + Sequence), "short_side_sizes must be List or Tuple" + + self.short_side_sizes = short_side_sizes + self.max_size = max_size + self.interp = interp + self.random_interp = random_interp + self.interps = [ + cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_AREA, + cv2.INTER_CUBIC, + cv2.INTER_LANCZOS4, + ] + + def get_size_with_aspect_ratio(self, image_shape, size, max_size=None): + h, w = image_shape + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int( + round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (w, h) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (ow, oh) + + def resize(self, + sample, + target_size, + max_size=None, + interp=cv2.INTER_LINEAR): + im = sample['image'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image type is not numpy.".format(self)) + if len(im.shape) != 3: + raise ImageError('{}: image is not 3-dimensional.'.format(self)) + + target_size = self.get_size_with_aspect_ratio(im.shape[:2], target_size, + max_size) + im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[ + 0] / im.shape[1] + + sample['image'] = cv2.resize(im, target_size, interpolation=interp) + sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32) + if 'scale_factor' in sample: + scale_factor = sample['scale_factor'] + sample['scale_factor'] = np.asarray( + [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], + dtype=np.float32) + else: + sample['scale_factor'] = np.asarray( + [im_scale_y, im_scale_x], dtype=np.float32) + + # apply bbox + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox( + sample['gt_bbox'], [im_scale_x, im_scale_y], target_size) + # apply polygon + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im.shape[:2], + [im_scale_x, im_scale_y]) + # apply semantic + if 'semantic' in sample and sample['semantic']: + semantic = sample['semantic'] + semantic = cv2.resize( + semantic.astype('float32'), + target_size, + interpolation=self.interp) + semantic = np.asarray(semantic).astype('int32') + semantic = np.expand_dims(semantic, 0) + sample['semantic'] = semantic + # apply gt_segm + if 'gt_segm' in sample and len(sample['gt_segm']) > 0: + masks = [ + cv2.resize( + gt_segm, target_size, interpolation=cv2.INTER_NEAREST) + for gt_segm in sample['gt_segm'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + return sample + + def apply_bbox(self, bbox, scale, size): + im_scale_x, im_scale_y = scale + resize_w, resize_h = size + bbox[:, 0::2] *= im_scale_x + bbox[:, 1::2] *= im_scale_y + bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) + bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) + return bbox.astype('float32') + + def apply_segm(self, segms, im_size, scale): + def _resize_poly(poly, im_scale_x, im_scale_y): + resized_poly = np.array(poly).astype('float32') + resized_poly[0::2] *= im_scale_x + resized_poly[1::2] *= im_scale_y + return resized_poly.tolist() + + def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, im_h, im_w) + + mask = mask_util.decode(rle) + mask = 
+            mask = cv2.resize(
+                mask,
+                None,
+                None,
+                fx=im_scale_x,
+                fy=im_scale_y,
+                interpolation=self.interp)
+            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
+            return rle
+
+        im_h, im_w = im_size
+        im_scale_x, im_scale_y = scale
+        resized_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                # Polygon format
+                resized_segms.append([
+                    _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm
+                ])
+            else:
+                # RLE format
+                import pycocotools.mask as mask_util
+                resized_segms.append(
+                    _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))
+
+        return resized_segms
+
+    def apply(self, sample, context=None):
+        target_size = random.choice(self.short_side_sizes)
+        interp = random.choice(
+            self.interps) if self.random_interp else self.interp
+
+        return self.resize(sample, target_size, self.max_size, interp)
+
+
+@register_op
+class RandomSizeCrop(BaseOperator):
+    """
+    Randomly crop the image according to `min_size` and `max_size`
+    """
+
+    def __init__(self, min_size, max_size):
+        super(RandomSizeCrop, self).__init__()
+        self.min_size = min_size
+        self.max_size = max_size
+
+        from paddle.vision.transforms.functional import crop as paddle_crop
+        self.paddle_crop = paddle_crop
+
+    @staticmethod
+    def get_crop_params(img_shape, output_size):
+        """Get parameters for ``crop`` for a random crop.
+        Args:
+            img_shape (list|tuple): Image's height and width.
+            output_size (list|tuple): Expected output size of the crop.
+        Returns:
+            tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
+        """
+        h, w = img_shape
+        th, tw = output_size
+
+        if h + 1 < th or w + 1 < tw:
+            raise ValueError(
+                "Required crop size {} is larger than input image size {}".
+                format((th, tw), (h, w)))
+
+        if w == tw and h == th:
+            return 0, 0, h, w
+
+        # random.randint is inclusive on both ends, so the upper bound is
+        # h - th (not h - th + 1) to keep the crop inside the image
+        i = random.randint(0, h - th)
+        j = random.randint(0, w - tw)
+        return i, j, th, tw
+
+    def crop(self, sample, region):
+        image_shape = sample['image'].shape[:2]
+        sample['image'] = self.paddle_crop(sample['image'], *region)
+
+        keep_index = None
+        # apply bbox
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], region)
+            bbox = sample['gt_bbox'].reshape([-1, 2, 2])
+            area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1)
+            keep_index = np.where(area > 0)[0]
+            sample['gt_bbox'] = sample['gt_bbox'][keep_index] if len(
+                keep_index) > 0 else np.zeros(
+                    [0, 4], dtype=np.float32)
+            sample['gt_class'] = sample['gt_class'][keep_index] if len(
+                keep_index) > 0 else np.zeros(
+                    [0, 1], dtype=np.float32)
+            if 'gt_score' in sample:
+                sample['gt_score'] = sample['gt_score'][keep_index] if len(
+                    keep_index) > 0 else np.zeros(
+                        [0, 1], dtype=np.float32)
+            if 'is_crowd' in sample:
+                sample['is_crowd'] = sample['is_crowd'][keep_index] if len(
+                    keep_index) > 0 else np.zeros(
+                        [0, 1], dtype=np.float32)
+
+        # apply polygon
+        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region,
+                                                image_shape)
+            if keep_index is not None:
+                sample['gt_poly'] = sample['gt_poly'][keep_index]
+        # apply gt_segm
+        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
+            i, j, h, w = region
+            sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w]
+            if keep_index is not None:
+                sample['gt_segm'] = sample['gt_segm'][keep_index]
+
+        return sample
+
+    def apply_bbox(self, bbox, region):
+        i, j, h, w = region
+        region_size = np.asarray([w, h])
+        crop_bbox = bbox - np.asarray([j, i, j, i])
+        crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size)
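+        # clip to the crop window; boxes that collapse to zero area are
+        # dropped by the area > 0 filter in crop()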
+        crop_bbox = crop_bbox.clip(min=0)
+        return crop_bbox.reshape([-1, 4]).astype('float32')
+
+    def apply_segm(self, segms, region, image_shape):
+        def _crop_poly(segm, crop):
+            xmin, ymin, xmax, ymax = crop
+            crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
+            crop_p = np.array(crop_coord).reshape(4, 2)
+            crop_p = Polygon(crop_p)
+
+            crop_segm = list()
+            for poly in segm:
+                poly = np.array(poly).reshape(len(poly) // 2, 2)
+                polygon = Polygon(poly)
+                if not polygon.is_valid:
+                    exterior = polygon.exterior
+                    multi_lines = exterior.intersection(exterior)
+                    polygons = shapely.ops.polygonize(multi_lines)
+                    polygon = MultiPolygon(polygons)
+                multi_polygon = list()
+                if isinstance(polygon, MultiPolygon):
+                    multi_polygon = copy.deepcopy(polygon)
+                else:
+                    multi_polygon.append(copy.deepcopy(polygon))
+                for per_polygon in multi_polygon:
+                    inter = per_polygon.intersection(crop_p)
+                    if not inter:
+                        continue
+                    if isinstance(inter, (MultiPolygon, GeometryCollection)):
+                        for part in inter:
+                            if not isinstance(part, Polygon):
+                                continue
+                            part = np.squeeze(
+                                np.array(part.exterior.coords[:-1]).reshape(1,
+                                                                            -1))
+                            part[0::2] -= xmin
+                            part[1::2] -= ymin
+                            crop_segm.append(part.tolist())
+                    elif isinstance(inter, Polygon):
+                        crop_poly = np.squeeze(
+                            np.array(inter.exterior.coords[:-1]).reshape(1, -1))
+                        crop_poly[0::2] -= xmin
+                        crop_poly[1::2] -= ymin
+                        crop_segm.append(crop_poly.tolist())
+                    else:
+                        continue
+            return crop_segm
+
+        def _crop_rle(rle, crop, height, width):
+            if 'counts' in rle and type(rle['counts']) == list:
+                rle = mask_util.frPyObjects(rle, height, width)
+            mask = mask_util.decode(rle)
+            mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
+            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
+            return rle
+
+        i, j, h, w = region
+        crop = [j, i, j + w, i + h]
+        height, width = image_shape
+        crop_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                import copy
+                import shapely.ops
+                from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
+                # Polygon format
+                crop_segms.append(_crop_poly(segm, crop))
+            else:
+                # RLE format
+                import pycocotools.mask as mask_util
+                crop_segms.append(_crop_rle(segm, crop, height, width))
+        return crop_segms
+
+    def apply(self, sample, context=None):
+        h = random.randint(self.min_size,
+                           min(sample['image'].shape[0], self.max_size))
+        w = random.randint(self.min_size,
+                           min(sample['image'].shape[1], self.max_size))
+
+        region = self.get_crop_params(sample['image'].shape[:2], [h, w])
+        return self.crop(sample, region)
diff --git a/ppdet/modeling/__init__.py b/ppdet/modeling/__init__.py
index f8aed7d60edc8b0d6996dbf258ce877987c690f5..5e4c26120e548a8d90ab9d8a2cb7c7bd4a6deee2 100644
--- a/ppdet/modeling/__init__.py
+++ b/ppdet/modeling/__init__.py
@@ -27,6 +27,7 @@ from . import post_process
 from . import layers
 from . import reid
 from . import mot
+from . import transformers
 
 from .ops import *
 from .backbones import *
@@ -39,3 +40,4 @@ from .post_process import *
 from .layers import *
 from .reid import *
 from .mot import *
+from .transformers import *
diff --git a/ppdet/modeling/architectures/__init__.py b/ppdet/modeling/architectures/__init__.py
index 2efcd1d0eaae7824a3e81e2eca2d5601533e821d..80fd81a9d3c82c06a9ab5c66db513737dc73372e 100644
--- a/ppdet/modeling/architectures/__init__.py
+++ b/ppdet/modeling/architectures/__init__.py
@@ -21,6 +21,7 @@ from . import jde
 from . import deepsort
 from . import fairmot
 from . import centernet
+from . import detr
 
 from .meta_arch import *
 from .faster_rcnn import *
@@ -39,3 +40,4 @@ from .deepsort import *
 from .fairmot import *
 from .centernet import *
 from .blazeface import *
+from .detr import *
diff --git a/ppdet/modeling/architectures/detr.py b/ppdet/modeling/architectures/detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c081bf6cdb8f4b1e4e0eb55157c08979a63469a
--- /dev/null
+++ b/ppdet/modeling/architectures/detr.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from .meta_arch import BaseArch
+from ppdet.core.workspace import register, create
+
+__all__ = ['DETR']
+
+
+@register
+class DETR(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+
+    def __init__(self,
+                 backbone,
+                 transformer,
+                 detr_head,
+                 post_process='DETRBBoxPostProcess'):
+        super(DETR, self).__init__()
+        self.backbone = backbone
+        self.transformer = transformer
+        self.detr_head = detr_head
+        self.post_process = post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+        # transformer
+        kwargs = {'input_shape': backbone.out_shape}
+        transformer = create(cfg['transformer'], **kwargs)
+        # head
+        kwargs = {
+            'hidden_dim': transformer.hidden_dim,
+            'nhead': transformer.nhead,
+            'input_shape': backbone.out_shape
+        }
+        detr_head = create(cfg['detr_head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'transformer': transformer,
+            "detr_head": detr_head,
+        }
+
+    def _forward(self):
+        # Backbone
+        body_feats = self.backbone(self.inputs)
+
+        # Transformer
+        out_transformer = self.transformer(body_feats, self.inputs['pad_mask'])
+
+        # DETR Head
+        if self.training:
+            return self.detr_head(out_transformer, body_feats, self.inputs)
+        else:
+            preds = self.detr_head(out_transformer, body_feats)
+            bbox, bbox_num = self.post_process(preds, self.inputs['im_shape'],
+                                               self.inputs['scale_factor'])
+            return bbox, bbox_num
+
+    def get_loss(self):
+        losses = self._forward()
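+        # sum every loss tensor into 'loss'; keys containing 'log' are
+        # metrics and are excluded from the optimized total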
+        losses.update({
+            'loss':
+            paddle.add_n([v for k, v in losses.items() if 'log' not in k])
+        })
+        return losses
+
+    def get_pred(self):
+        bbox_pred, bbox_num = self._forward()
+        output = {
+            "bbox": bbox_pred,
+            "bbox_num": bbox_num,
+        }
+        return output
diff --git a/ppdet/modeling/heads/__init__.py b/ppdet/modeling/heads/__init__.py
index 04be00e9eba6b5e65f229af88ba82a6c4f613dbc..040b040eb8caa8dab8ea00a6e7c63813cbdfbfa5 100644
--- a/ppdet/modeling/heads/__init__.py
+++ b/ppdet/modeling/heads/__init__.py
@@ -25,6 +25,7 @@ from . import face_head
 from . import s2anet_head
 from . import keypoint_hrhrnet_head
 from . import centernet_head
+from . import detr_head
 
 from .bbox_head import *
 from .mask_head import *
@@ -39,3 +40,4 @@ from .face_head import *
 from .s2anet_head import *
 from .keypoint_hrhrnet_head import *
 from .centernet_head import *
+from .detr_head import *
diff --git a/ppdet/modeling/heads/detr_head.py b/ppdet/modeling/heads/detr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec511654bd32939bb7e855fae396cefcfe9218e0
--- /dev/null
+++ b/ppdet/modeling/heads/detr_head.py
@@ -0,0 +1,278 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register
+import pycocotools.mask as mask_util
+from ..initializer import *
+
+__all__ = ['DETRHead']
+
+
+class MLP(nn.Layer):
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.LayerList(
+            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        for l in self.layers:
+            linear_init_(l)
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+class MultiHeadAttentionMap(nn.Layer):
+    """This is a 2D attention module, which only returns the attention
+    softmax (no multiplication by value)."""
+
+    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0,
+                 bias=True):
+        super().__init__()
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.dropout = nn.Dropout(dropout)
+
+        weight_attr = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.XavierUniform())
+        bias_attr = paddle.framework.ParamAttr(
+            initializer=paddle.nn.initializer.Constant()) if bias else False
+
+        self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)
+        self.k_proj = nn.Conv2D(
+            query_dim,
+            hidden_dim,
+            1,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr)
+
+        self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5
+
+    def forward(self, q, k, mask=None):
+        q = self.q_proj(q)
+        k = self.k_proj(k)
+        bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\
+            self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]
+        qh = q.reshape([bs, num_queries, n, c])
+        kh = k.reshape([bs, n, c, h, w])
+        # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
+        qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c])
+        kh = kh.reshape([-1, c, h * w])
+        weights = paddle.bmm(qh * self.normalize_fact, kh).reshape(
+            [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4])
+
+        if mask is not None:
+            weights += mask
+        # fix a potential bug: https://github.com/facebookresearch/detr/issues/247
+        weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape)
+        weights = self.dropout(weights)
+        return weights
+
+
+class MaskHeadFPNConv(nn.Layer):
+    """
+    Simple convolutional head, using group norm.
+    Upsampling is done using an FPN approach.
+    """
+
+    def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):
+        super().__init__()
+
+        inter_dims = [input_dim,
+                      ] + [context_dim // (2**i) for i in range(1, 5)]
+        weight_attr = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.KaimingUniform())
+        bias_attr = paddle.framework.ParamAttr(
+            initializer=paddle.nn.initializer.Constant())
+
+        self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,
+                                       weight_attr, bias_attr)
+        self.conv_inter = nn.LayerList()
+        for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]):
+            self.conv_inter.append(
+                self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr,
+                                  bias_attr))
+
+        self.conv_out = nn.Conv2D(
+            inter_dims[-1],
+            1,
+            3,
+            padding=1,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr)
+
+        self.adapter = nn.LayerList()
+        for i in range(len(fpn_dims)):
+            self.adapter.append(
+                nn.Conv2D(
+                    fpn_dims[i],
+                    inter_dims[i + 1],
+                    1,
+                    weight_attr=weight_attr,
+                    bias_attr=bias_attr))
+
+    def _make_layers(self,
+                     in_dims,
+                     out_dims,
+                     kernel_size,
+                     num_groups,
+                     weight_attr=None,
+                     bias_attr=None):
+        return nn.Sequential(
+            nn.Conv2D(
+                in_dims,
+                out_dims,
+                kernel_size,
+                padding=kernel_size // 2,
+                weight_attr=weight_attr,
+                bias_attr=bias_attr),
+            nn.GroupNorm(num_groups, out_dims),
+            nn.ReLU())
+
+    def forward(self, x, bbox_attention_map, fpns):
+        x = paddle.concat([
+            x.tile([bbox_attention_map.shape[1], 1, 1, 1]),
+            bbox_attention_map.flatten(0, 1)
+        ], 1)
+        x = self.conv0(x)
+        for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1],
+                                                    self.adapter, fpns):
+            feat = adapter_layer(feat).tile(
+                [bbox_attention_map.shape[1], 1, 1, 1])
+            x = inter_layer(x)
+            x = feat + F.interpolate(x, size=feat.shape[-2:])
+
+        x = self.conv_inter[-1](x)
+        x = self.conv_out(x)
+        return x
+
+
+@register
+class DETRHead(nn.Layer):
+    __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']
+    __inject__ = ['loss']
+
+    def __init__(self,
+                 num_classes=80,
+                 hidden_dim=256,
+                 nhead=8,
+                 num_mlp_layers=3,
+                 loss='DETRLoss',
+                 fpn_dims=[1024, 512, 256],
+                 with_mask_head=False,
+                 use_focal_loss=False):
+        super(DETRHead, self).__init__()
+        # add background class
+        self.num_classes = num_classes if use_focal_loss else num_classes + 1
+        self.hidden_dim = hidden_dim
+        self.loss = loss
+        self.with_mask_head = with_mask_head
+        self.use_focal_loss = use_focal_loss
+
+        self.score_head = nn.Linear(hidden_dim, self.num_classes)
+        self.bbox_head = MLP(hidden_dim,
+                             hidden_dim,
+                             output_dim=4,
+                             num_layers=num_mlp_layers)
+        if self.with_mask_head:
+            self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim,
+                                                        nhead)
+            self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims,
+                                             hidden_dim)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.score_head)
+
+    @classmethod
+    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
+
+        return {
+            'hidden_dim': hidden_dim,
+            'nhead': nhead,
+            'fpn_dims': [i.channels for i in input_shape[::-1]][1:]
+        }
+
+    @staticmethod
+    def get_gt_mask_from_polygons(gt_poly, pad_mask):
+        out_gt_mask = []
+        for polygons, padding in zip(gt_poly, pad_mask):
+            height, width = int(padding[:, 0].sum()), int(padding[0, :].sum())
+            masks = []
+            for obj_poly in polygons:
+                rles = mask_util.frPyObjects(obj_poly, height, width)
+                rle = mask_util.merge(rles)
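+                # decode the merged RLE into one HxW binary mask per object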
+                masks.append(
+                    paddle.to_tensor(mask_util.decode(rle)).astype('float32'))
+            masks = paddle.stack(masks)
+            masks_pad = paddle.zeros(
+                [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]])
+            masks_pad[:, :height, :width] = masks
+            out_gt_mask.append(masks_pad)
+        return out_gt_mask
+
+    def forward(self, out_transformer, body_feats, inputs=None):
+        r"""
+        Args:
+            out_transformer (Tuple): (feats: [num_levels, batch_size,
+                                             num_queries, hidden_dim],
+                                      memory: [batch_size, hidden_dim, h, w],
+                                      src_proj: [batch_size, h*w, hidden_dim],
+                                      src_mask: [batch_size, 1, 1, h, w])
+            body_feats (List(Tensor)): list[[B, C, H, W]]
+            inputs (dict): dict(inputs)
+        """
+        feats, memory, src_proj, src_mask = out_transformer
+        outputs_logit = self.score_head(feats)
+        outputs_bbox = F.sigmoid(self.bbox_head(feats))
+        outputs_seg = None
+        if self.with_mask_head:
+            bbox_attention_map = self.bbox_attention(feats[-1], memory,
+                                                     src_mask)
+            fpn_feats = [a for a in body_feats[::-1]][1:]
+            outputs_seg = self.mask_head(src_proj, bbox_attention_map,
+                                         fpn_feats)
+            outputs_seg = outputs_seg.reshape([
+                feats.shape[1], feats.shape[2], outputs_seg.shape[-2],
+                outputs_seg.shape[-1]
+            ])
+
+        if self.training:
+            assert inputs is not None
+            assert 'gt_bbox' in inputs and 'gt_class' in inputs
+            gt_mask = self.get_gt_mask_from_polygons(
+                inputs['gt_poly'],
+                inputs['pad_mask']) if 'gt_poly' in inputs else None
+            return self.loss(
+                outputs_bbox,
+                outputs_logit,
+                inputs['gt_bbox'],
+                inputs['gt_class'],
+                masks=outputs_seg,
+                gt_mask=gt_mask)
+        else:
+            return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)
diff --git a/ppdet/modeling/initializer.py b/ppdet/modeling/initializer.py
index ce5dea95ab356014731edcd6c2b2d7c6207ab936..0e9aef403d9475e109dfa65373b2a55fd24b720c 100644
--- a/ppdet/modeling/initializer.py
+++ b/ppdet/modeling/initializer.py
@@ -28,6 +28,8 @@ __all__ = [
     'xavier_normal_',
     'kaiming_uniform_',
     'kaiming_normal_',
+    'linear_init_',
+    'conv_init_',
     'reset_initialized_parameter',
 ]
 
@@ -46,7 +48,7 @@ def _no_grad_normal_(tensor, mean=0., std=1.):
     return tensor
 
 
-def _no_grad_fill_(tensor, value=0):
+def _no_grad_fill_(tensor, value=0.):
     with paddle.no_grad():
         v = paddle.rand(shape=tensor.shape, dtype=tensor.dtype)
         v[...] = value
@@ -80,7 +82,7 @@ def normal_(tensor, mean=0., std=1.):
     return _no_grad_normal_(tensor, mean, std)
 
 
-def constant_(tensor, value=0):
+def constant_(tensor, value=0.):
     """
     Modified tensor inspace using constant_
     Args:
@@ -150,7 +152,7 @@ def xavier_uniform_(tensor, gain=1., reverse=False):
     Modified tensor inspace using xavier_uniform_
     Args:
         tensor (paddle.Tensor): paddle Tensor
-        gain (str): super parameter, 1. default.
+        gain (float): hyper parameter, 1. by default.
        reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
     Return:
         tensor
@@ -166,7 +168,7 @@ def xavier_normal_(tensor, gain=1., reverse=False):
     Modified tensor inspace using xavier_normal_
     Args:
         tensor (paddle.Tensor): paddle Tensor
-        gain (str): super parameter, 1. default.
+        gain (float): hyper parameter, 1. by default.
        reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
     Return:
         tensor
@@ -260,6 +262,18 @@ def kaiming_normal_(tensor,
     return _no_grad_normal_(tensor, 0, std)
 
 
+def linear_init_(module):
+    bound = 1 / math.sqrt(module.weight.shape[0])
+    uniform_(module.weight, -bound, bound)
+    uniform_(module.bias, -bound, bound)
+
+
+def conv_init_(module):
+    # fan_in is the product of the kernel dims; math.prod needs Python >= 3.8
+    bound = 1 / math.sqrt(math.prod(module.weight.shape[1:]))
+    uniform_(module.weight, -bound, bound)
+    uniform_(module.bias, -bound, bound)
+
+
 @paddle.no_grad()
 def reset_initialized_parameter(model, include_self=True):
     """
diff --git a/ppdet/modeling/layers.py b/ppdet/modeling/layers.py
index a9aea6167bc52b0545dadb02f2fcebdc898d745f..85057c860882f7a04f9f720f0a4cd724b016b1f0 100644
--- a/ppdet/modeling/layers.py
+++ b/ppdet/modeling/layers.py
@@ -29,8 +29,11 @@ from paddle.regularizer import L2Decay
 from ppdet.core.workspace import register, serializable
 from ppdet.modeling.bbox_utils import delta2bbox
 from . import ops
+from .initializer import xavier_uniform_, constant_
 
 from paddle.vision.ops import DeformConv2D
+from paddle.nn.layer import transformer
+_convert_attention_mask = transformer._convert_attention_mask
 
 
 def _to_list(l):
@@ -1187,3 +1190,179 @@ class Concat(nn.Layer):
 
     def extra_repr(self):
         return 'dim={}'.format(self.dim)
+
+
+class MultiHeadAttention(nn.Layer):
+    """
+    Attention maps queries and a set of key-value pairs to outputs, and
+    Multi-Head Attention performs multiple attention operations in parallel
+    to jointly attend to information from different representation subspaces.
+
+    Please refer to `Attention Is All You Need
+    <https://arxiv.org/abs/1706.03762>`_ for more details.
+
+    Parameters:
+        embed_dim (int): The expected feature size in the input and output.
+        num_heads (int): The number of heads in multi-head attention.
+        dropout (float, optional): The dropout probability used on attention
+            weights to drop some attention targets. 0 for no dropout. Default 0
+        kdim (int, optional): The feature size in key. If None, assumed equal to
+            `embed_dim`. Default None.
+        vdim (int, optional): The feature size in value. If None, assumed equal to
+            `embed_dim`. Default None.
+        need_weights (bool, optional): Indicate whether to return the attention
+            weights. Default False.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+
+            # encoder input: [batch_size, sequence_length, d_model]
+            query = paddle.rand((2, 4, 128))
+            # self attention mask: [batch_size, num_heads, query_len, query_len]
+            attn_mask = paddle.rand((2, 2, 4, 4))
+            multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)
+            output = multi_head_attn(query, None, None, attn_mask=attn_mask)  # [2, 4, 128]
+    """
+
+    def __init__(self,
+                 embed_dim,
+                 num_heads,
+                 dropout=0.,
+                 kdim=None,
+                 vdim=None,
+                 need_weights=False):
+        super(MultiHeadAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.need_weights = need_weights
+
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+        if self._qkv_same_embed_dim:
+            self.in_proj_weight = self.create_parameter(
+                shape=[embed_dim, 3 * embed_dim],
+                attr=None,
+                dtype=self._dtype,
+                is_bias=False)
+            self.in_proj_bias = self.create_parameter(
+                shape=[3 * embed_dim],
+                attr=None,
+                dtype=self._dtype,
+                is_bias=True)
+        else:
+            self.q_proj = nn.Linear(embed_dim, embed_dim)
+            self.k_proj = nn.Linear(self.kdim, embed_dim)
+            self.v_proj = nn.Linear(self.vdim, embed_dim)
+
+        self.out_proj = nn.Linear(embed_dim, embed_dim)
+        self._type_list = ('q_proj', 'k_proj', 'v_proj')
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                xavier_uniform_(p)
+            else:
+                constant_(p)
+
+    def compute_qkv(self, tensor, index):
+        if self._qkv_same_embed_dim:
+            tensor = F.linear(
+                x=tensor,
+                weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1)
+                                           * self.embed_dim],
+                bias=self.in_proj_bias[index * self.embed_dim:(index + 1) *
+                                       self.embed_dim]
+                if self.in_proj_bias is not None else None)
+        else:
+            tensor = getattr(self, self._type_list[index])(tensor)
+        tensor = tensor.reshape(
+            [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
+        return tensor
+
+    def forward(self, query, key=None, value=None, attn_mask=None):
+        r"""
+        Applies multi-head attention to map queries and a set of key-value pairs
+        to outputs.
+
+        Parameters:
+            query (Tensor): The queries for multi-head attention. It is a
+                tensor with shape `[batch_size, query_length, embed_dim]`. The
+                data type should be float32 or float64.
+            key (Tensor, optional): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, key_length, kdim]`. The
+                data type should be float32 or float64. If None, use `query` as
+                `key`. Default None.
+            value (Tensor, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, value_length, vdim]`.
+                The data type should be float32 or float64. If None, use `query` as
+                `value`. Default None.
+            attn_mask (Tensor, optional): A tensor used in multi-head attention
+                to prevent attention to some unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
+                When the data type is bool, the unwanted positions have `False`
+                values and the others have `True` values. When the data type is
+                int, the unwanted positions have 0 values and the others have 1
+                values. When the data type is float, the unwanted positions have
+                `-INF` values and the others have 0 values. It can be None when
+                no position needs to be masked. Default None.
+
+        Returns:
+            Tensor|tuple: It is a tensor that has the same shape and data type \
+                as `query`, representing attention output. Or a tuple if \
+                `need_weights` is True or `cache` is not None. If `need_weights` \
+                is True, except for attention output, the tuple also includes \
+                the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \
+                If `cache` is not None, the tuple then includes the new cache \
+                having the same type as `cache`, and if it is `StaticCache`, it \
+                is same as the input `cache`, if it is `Cache`, the new cache \
+                reserves tensors concatenating raw tensors with intermediate \
+                results of the current query.
+        """
+        key = query if key is None else key
+        value = query if value is None else value
+        # compute q, k, v
+        q, k, v = (self.compute_qkv(t, i)
+                   for i, t in enumerate([query, key, value]))
+
+        # scaled dot-product attention
+        product = paddle.matmul(x=q, y=k, transpose_y=True)
+        scaling = float(self.head_dim)**-0.5
+        product = product * scaling
+
+        if attn_mask is not None:
+            # Support bool or int mask
+            attn_mask = _convert_attention_mask(attn_mask, product.dtype)
+            product = product + attn_mask
+        weights = F.softmax(product)
+        if self.dropout:
+            weights = F.dropout(
+                weights,
+                self.dropout,
+                training=self.training,
+                mode="upscale_in_train")
+
+        out = paddle.matmul(weights, v)
+
+        # combine heads
+        out = paddle.transpose(out, perm=[0, 2, 1, 3])
+        out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
+
+        # project to output
+        out = self.out_proj(out)
+
+        outs = [out]
+        if self.need_weights:
+            outs.append(weights)
+        return out if len(outs) == 1 else tuple(outs)
diff --git a/ppdet/modeling/losses/__init__.py b/ppdet/modeling/losses/__init__.py
index 7b7ecd63bd85b3b344865b20ed70f89ce60b30c0..ce75151354ba0d9aa12f006b9a6db7f048372554 100644
--- a/ppdet/modeling/losses/__init__.py
+++ b/ppdet/modeling/losses/__init__.py
@@ -22,6 +22,7 @@ from . import ctfocal_loss
 from . import keypoint_loss
 from . import jde_loss
 from . import fairmot_loss
+from . import detr_loss
 
 from .yolo_loss import *
 from .iou_aware_loss import *
@@ -33,3 +34,4 @@ from .ctfocal_loss import *
 from .keypoint_loss import *
 from .jde_loss import *
 from .fairmot_loss import *
+from .detr_loss import *
diff --git a/ppdet/modeling/losses/detr_loss.py b/ppdet/modeling/losses/detr_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..24eed2e78f8d5073384cc38672d64df67316c9ce
--- /dev/null
+++ b/ppdet/modeling/losses/detr_loss.py
@@ -0,0 +1,230 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register
+from .iou_loss import GIoULoss
+from ..transformers import bbox_cxcywh_to_xyxy, bbox_overlaps, sigmoid_focal_loss
+
+__all__ = ['DETRLoss']
+
+
+@register
+class DETRLoss(nn.Layer):
+    __shared__ = ['num_classes', 'use_focal_loss']
+    __inject__ = ['matcher']
+
+    def __init__(self,
+                 num_classes=80,
+                 matcher='HungarianMatcher',
+                 loss_coeff={
+                     'class': 1,
+                     'bbox': 5,
+                     'giou': 2,
+                     'no_object': 0.1,
+                     'mask': 1,
+                     'dice': 1
+                 },
+                 aux_loss=True,
+                 use_focal_loss=False):
+        r"""
+        Args:
+            num_classes (int): The number of classes.
+            matcher (HungarianMatcher): It computes an assignment between the targets
+                and the predictions of the network.
+            loss_coeff (dict): The coefficient of loss.
+            aux_loss (bool): If aux_loss is True, the losses at each decoder
+                layer are used as well.
+            use_focal_loss (bool): Use focal loss or not.
+        """
+        super(DETRLoss, self).__init__()
+        self.num_classes = num_classes
+
+        self.matcher = matcher
+        self.loss_coeff = loss_coeff
+        self.aux_loss = aux_loss
+        self.use_focal_loss = use_focal_loss
+
+        if not self.use_focal_loss:
+            self.loss_coeff['class'] = paddle.full([num_classes + 1],
+                                                   loss_coeff['class'])
+            self.loss_coeff['class'][-1] = loss_coeff['no_object']
+        self.giou_loss = GIoULoss()
+
+    def _get_loss_class(self, logits, gt_class, match_indices, bg_index,
+                        num_gts):
+        # logits: [b, query, num_classes], gt_class: list[[n, 1]]
+        target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
+        bs, num_query_objects = target_label.shape
+        if sum(len(a) for a in gt_class) > 0:
+            index, updates = self._get_index_updates(num_query_objects,
+                                                     gt_class, match_indices)
+            target_label = paddle.scatter(
+                target_label.reshape([-1, 1]), index, updates.astype('int64'))
+            target_label = target_label.reshape([bs, num_query_objects])
+        if self.use_focal_loss:
+            target_label = F.one_hot(target_label,
+                                     self.num_classes + 1)[:, :, :-1]
+        return {
+            'loss_class': self.loss_coeff['class'] * sigmoid_focal_loss(
+                logits, target_label, num_gts / num_query_objects)
+            if self.use_focal_loss else F.cross_entropy(
+                logits, target_label, weight=self.loss_coeff['class'])
+        }
+
+    def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts):
+        # boxes: [b, query, 4], gt_bbox: list[[n, 4]]
+        loss = dict()
+        if sum(len(a) for a in gt_bbox) == 0:
+            loss['loss_bbox'] = paddle.to_tensor([0.])
+            loss['loss_giou'] = paddle.to_tensor([0.])
+            return loss
+
+        src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
+                                                            match_indices)
+        loss['loss_bbox'] = self.loss_coeff['bbox'] * F.l1_loss(
+            src_bbox, target_bbox, reduction='sum') / num_gts
+        loss['loss_giou'] = self.giou_loss(
+            bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
+        loss['loss_giou'] = loss['loss_giou'].sum() / num_gts
+        loss['loss_giou'] = self.loss_coeff['giou'] * loss['loss_giou']
+        return loss
+
+    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts):
+        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
+        loss = dict()
+        if sum(len(a) for a in gt_mask) == 0:
+            loss['loss_mask'] = paddle.to_tensor([0.])
+            loss['loss_dice'] = paddle.to_tensor([0.])
+            return loss
+
+        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
+                                                              match_indices)
+        src_masks = F.interpolate(
+            src_masks.unsqueeze(0),
+            size=target_masks.shape[-2:],
+            mode="bilinear")[0]
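+        # the third argument of F.sigmoid_focal_loss is the normalizer: the
+        # summed loss is divided by the number of ground-truth objects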
+        loss['loss_mask'] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
+            src_masks,
+            target_masks,
+            paddle.to_tensor(
+                [num_gts], dtype='float32'))
+        loss['loss_dice'] = self.loss_coeff['dice'] * self._dice_loss(
+            src_masks, target_masks, num_gts)
+        return loss
+
+    def _dice_loss(self, inputs, targets, num_gts):
+        inputs = F.sigmoid(inputs)
+        inputs = inputs.flatten(1)
+        targets = targets.flatten(1)
+        numerator = 2 * (inputs * targets).sum(1)
+        denominator = inputs.sum(-1) + targets.sum(-1)
+        loss = 1 - (numerator + 1) / (denominator + 1)
+        return loss.sum() / num_gts
+
+    def _get_loss_aux(self, boxes, logits, gt_bbox, gt_class, bg_index,
+                      num_gts):
+        loss_class = []
+        loss_bbox = []
+        loss_giou = []
+        for aux_boxes, aux_logits in zip(boxes, logits):
+            match_indices = self.matcher(aux_boxes, aux_logits, gt_bbox,
+                                         gt_class)
+            loss_class.append(
+                self._get_loss_class(aux_logits, gt_class, match_indices,
+                                     bg_index, num_gts)['loss_class'])
+            loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
+                                        num_gts)
+            loss_bbox.append(loss_['loss_bbox'])
+            loss_giou.append(loss_['loss_giou'])
+        loss = {
+            'loss_class_aux': paddle.add_n(loss_class),
+            'loss_bbox_aux': paddle.add_n(loss_bbox),
+            'loss_giou_aux': paddle.add_n(loss_giou)
+        }
+        return loss
+
+    def _get_index_updates(self, num_query_objects, target, match_indices):
+        batch_idx = paddle.concat([
+            paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)
+        ])
+        src_idx = paddle.concat([src for (src, _) in match_indices])
+        src_idx += (batch_idx * num_query_objects)
+        target_assign = paddle.concat([
+            paddle.gather(
+                t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
+        ])
+        return src_idx, target_assign
+
+    def _get_src_target_assign(self, src, target, match_indices):
+        src_assign = paddle.concat([
+            paddle.gather(
+                t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])
+            for t, (I, _) in zip(src, match_indices)
+        ])
+        target_assign = paddle.concat([
+            paddle.gather(
+                t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
+            for t, (_, J) in zip(target, match_indices)
+        ])
+        return src_assign, target_assign
+
+    def forward(self,
+                boxes,
+                logits,
+                gt_bbox,
+                gt_class,
+                masks=None,
+                gt_mask=None):
+        r"""
+        Args:
+            boxes (Tensor): [l, b, query, 4]
+            logits (Tensor): [l, b, query, num_classes]
+            gt_bbox (List(Tensor)): list[[n, 4]]
+            gt_class (List(Tensor)): list[[n, 1]]
+            masks (Tensor, optional): [b, query, h, w]
+            gt_mask (List(Tensor), optional): list[[n, H, W]]
+        """
+        match_indices = self.matcher(boxes[-1].detach(), logits[-1].detach(),
+                                     gt_bbox, gt_class)
+        num_gts = sum(len(a) for a in gt_bbox)
+        try:
+            # TODO: Paddle does not have a "paddle.distributed.is_initialized()"
+            num_gts = paddle.to_tensor([num_gts], dtype=paddle.float32)
+            paddle.distributed.all_reduce(num_gts)
+            num_gts = paddle.clip(
+                num_gts / paddle.distributed.get_world_size(), min=1).item()
+        except Exception:
+            num_gts = max(num_gts, 1)
+        total_loss = dict()
+        total_loss.update(
+            self._get_loss_class(logits[-1], gt_class, match_indices,
+                                 self.num_classes, num_gts))
+        total_loss.update(
+            self._get_loss_bbox(boxes[-1], gt_bbox, match_indices, num_gts))
+        if masks is not None and gt_mask is not None:
+            total_loss.update(
+                self._get_loss_mask(masks, gt_mask, match_indices, num_gts))
+
+        if self.aux_loss:
+            total_loss.update(
+                self._get_loss_aux(boxes[:-1], logits[:-1], gt_bbox, gt_class,
+                                   self.num_classes, num_gts))
+
+        return total_loss
diff --git a/ppdet/modeling/post_process.py b/ppdet/modeling/post_process.py
index 9c917242d92ccaca431c4ce245b99ed0a259f1b7..712b3ddc1979677067850492f5adb8db6a5b09bf 100644
--- a/ppdet/modeling/post_process.py
+++ b/ppdet/modeling/post_process.py
@@ -19,18 +19,16 @@ import paddle.nn.functional as F
 from ppdet.core.workspace import register
 from ppdet.modeling.bbox_utils import nonempty_bbox, rbox2poly, rbox2poly
 from ppdet.modeling.layers import TTFBox
+from .transformers import bbox_cxcywh_to_xyxy
 try:
     from collections.abc import Sequence
 except Exception:
     from collections import Sequence
 
 __all__ = [
-    'BBoxPostProcess',
-    'MaskPostProcess',
-    'FCOSPostProcess',
-    'S2ANetBBoxPostProcess',
-    'JDEBBoxPostProcess',
-    'CenterNetPostProcess',
+    'BBoxPostProcess', 'MaskPostProcess', 'FCOSPostProcess',
+    'S2ANetBBoxPostProcess', 'JDEBBoxPostProcess', 'CenterNetPostProcess',
+    'DETRBBoxPostProcess'
 ]
 
@@ -492,3 +490,64 @@ class CenterNetPostProcess(TTFBox):
         else:
             results = paddle.concat([clses, scores, bboxes], axis=1)
         return results, paddle.shape(results)[0:1]
+
+
+@register
+class DETRBBoxPostProcess(object):
+    __shared__ = ['num_classes', 'use_focal_loss']
+    __inject__ = []
+
+    def __init__(self,
+                 num_classes=80,
+                 num_top_queries=100,
+                 use_focal_loss=False):
+        super(DETRBBoxPostProcess, self).__init__()
+        self.num_classes = num_classes
+        self.num_top_queries = num_top_queries
+        self.use_focal_loss = use_focal_loss
+
+    def __call__(self, head_out, im_shape, scale_factor):
+        """
+        Decode the bbox.
+
+        Args:
+            head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output.
+            im_shape (Tensor): The shape of the input image.
+            scale_factor (Tensor): The scale factor of the input image.
+        Returns:
+            bbox_pred (Tensor): The output prediction with shape [N, 6], including
+                labels, scores and bboxes. The bbox coordinates correspond to the
+                original input image; the bboxes may be used in another branch.
+            bbox_num (Tensor): The number of prediction boxes of each batch with
+                shape [bs], whose sum is N.
+        """
+        bboxes, logits, masks = head_out
+
+        bbox_pred = bbox_cxcywh_to_xyxy(bboxes)
+        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
+        img_h, img_w = origin_shape.unbind(1)
+        origin_shape = paddle.stack(
+            [img_w, img_h, img_w, img_h], axis=-1).unsqueeze(0)
+        bbox_pred *= origin_shape
+
+        scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax(
+            logits)[:, :, :-1]
+        scores, labels = scores.max(-1), scores.argmax(-1)
+
+        if scores.shape[1] > self.num_top_queries:
+            scores, index = paddle.topk(scores, self.num_top_queries, axis=-1)
+            labels = paddle.stack(
+                [paddle.gather(l, i) for l, i in zip(labels, index)])
+            bbox_pred = paddle.stack(
+                [paddle.gather(b, i) for b, i in zip(bbox_pred, index)])
+
+        bbox_pred = paddle.concat(
+            [
+                labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1),
+                bbox_pred
+            ],
+            axis=-1)
+        bbox_num = paddle.to_tensor(
+            bbox_pred.shape[1], dtype='int32').tile([bbox_pred.shape[0]])
+        bbox_pred = bbox_pred.reshape([-1, 6])
+        return bbox_pred, bbox_num
diff --git a/ppdet/modeling/transformers/__init__.py b/ppdet/modeling/transformers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bdcf4c268669d69f7a7fb59354e599fd9084580
--- /dev/null
+++ b/ppdet/modeling/transformers/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import detr_transformer
+from . import utils
+from . import matchers
+from . import position_encoding
+
+from .detr_transformer import *
+from .utils import *
+from .matchers import *
+from .position_encoding import *
diff --git a/ppdet/modeling/transformers/detr_transformer.py b/ppdet/modeling/transformers/detr_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..92d79d53c7c8b6edc273e461faa3d211b6c6fcbe
--- /dev/null
+++ b/ppdet/modeling/transformers/detr_transformer.py
@@ -0,0 +1,351 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle.nn.layer.transformer import _convert_attention_mask
+import paddle.nn.functional as F
+
+from ppdet.core.workspace import register
+from ..layers import MultiHeadAttention
+from .position_encoding import PositionEmbedding
+from .utils import *
+from ..initializer import *
+
+__all__ = ['DETRTransformer']
+
+
+class TransformerEncoderLayer(nn.Layer):
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False):
+        super(TransformerEncoderLayer, self).__init__()
+        attn_dropout = dropout if attn_dropout is None else attn_dropout
+        act_dropout = dropout if act_dropout is None else act_dropout
+        self.normalize_before = normalize_before
+
+        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.activation = getattr(F, activation)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos_embed):
+        return tensor if pos_embed is None else tensor + pos_embed
+
+    def forward(self, src, src_mask=None, pos_embed=None):
+        src_mask = _convert_attention_mask(src_mask, src.dtype)
+
+        residual = src
+        if self.normalize_before:
+            src = self.norm1(src)
+        q = k = self.with_pos_embed(src, pos_embed)
+        src = self.self_attn(q, k, value=src, attn_mask=src_mask)
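+        # residual connection; post-norm by default, pre-norm when
+        # normalize_before is True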
+        src = residual + self.dropout1(src)
+        if not self.normalize_before:
+            src = self.norm1(src)
+
+        residual = src
+        if self.normalize_before:
+            src = self.norm2(src)
+        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = residual + self.dropout2(src)
+        if not self.normalize_before:
+            src = self.norm2(src)
+        return src
+
+
+class TransformerEncoder(nn.Layer):
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super(TransformerEncoder, self).__init__()
+        self.layers = _get_clones(encoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(self, src, src_mask=None, pos_embed=None):
+        src_mask = _convert_attention_mask(src_mask, src.dtype)
+
+        output = src
+        for layer in self.layers:
+            output = layer(output, src_mask=src_mask, pos_embed=pos_embed)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output
+
+
+class TransformerDecoderLayer(nn.Layer):
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False):
+        super(TransformerDecoderLayer, self).__init__()
+        attn_dropout = dropout if attn_dropout is None else attn_dropout
+        act_dropout = dropout if act_dropout is None else act_dropout
+        self.normalize_before = normalize_before
+
+        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
+        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.activation = getattr(F, activation)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos_embed):
+        return tensor if pos_embed is None else tensor + pos_embed
+
+    def forward(self,
+                tgt,
+                memory,
+                tgt_mask=None,
+                memory_mask=None,
+                pos_embed=None,
+                query_pos_embed=None):
+        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
+        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)
+
+        residual = tgt
+        if self.normalize_before:
+            tgt = self.norm1(tgt)
+        q = k = self.with_pos_embed(tgt, query_pos_embed)
+        tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask)
+        tgt = residual + self.dropout1(tgt)
+        if not self.normalize_before:
+            tgt = self.norm1(tgt)
+
+        residual = tgt
+        if self.normalize_before:
+            tgt = self.norm2(tgt)
+        q = self.with_pos_embed(tgt, query_pos_embed)
+        k = self.with_pos_embed(memory, pos_embed)
+        tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask)
+        tgt = residual + self.dropout2(tgt)
+        if not self.normalize_before:
+            tgt = self.norm2(tgt)
+
+        residual = tgt
+        if self.normalize_before:
+            tgt = self.norm3(tgt)
+        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
+        tgt = residual + self.dropout3(tgt)
+        if not self.normalize_before:
+            tgt = self.norm3(tgt)
+        return tgt
+
+
+class TransformerDecoder(nn.Layer):
+    def __init__(self,
+                 decoder_layer,
+                 num_layers,
+                 norm=None,
+                 return_intermediate=False):
+        super(TransformerDecoder, self).__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+        self.return_intermediate = return_intermediate
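+        # with return_intermediate, the normalized output of every decoder
+        # layer is stacked so DETR can apply per-layer auxiliary losses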
+
+    def forward(self,
+                tgt,
+                memory,
+                tgt_mask=None,
+                memory_mask=None,
+                pos_embed=None,
+                query_pos_embed=None):
+        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
+        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)
+
+        output = tgt
+        intermediate = []
+        for layer in self.layers:
+            output = layer(
+                output,
+                memory,
+                tgt_mask=tgt_mask,
+                memory_mask=memory_mask,
+                pos_embed=pos_embed,
+                query_pos_embed=query_pos_embed)
+            if self.return_intermediate:
+                intermediate.append(self.norm(output))
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        if self.return_intermediate:
+            return paddle.stack(intermediate)
+
+        return output.unsqueeze(0)
+
+
+@register
+class DETRTransformer(nn.Layer):
+    __shared__ = ['hidden_dim']
+
+    def __init__(self,
+                 num_queries=100,
+                 position_embed_type='sine',
+                 return_intermediate_dec=True,
+                 backbone_num_channels=2048,
+                 hidden_dim=256,
+                 nhead=8,
+                 num_encoder_layers=6,
+                 num_decoder_layers=6,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False):
+        super(DETRTransformer, self).__init__()
+        assert position_embed_type in ['sine', 'learned'],\
+            f'ValueError: position_embed_type not supported {position_embed_type}!'
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+
+        encoder_layer = TransformerEncoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation,
+            attn_dropout, act_dropout, normalize_before)
+        encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None
+        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
+                                          encoder_norm)
+
+        decoder_layer = TransformerDecoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation,
+            attn_dropout, act_dropout, normalize_before)
+        decoder_norm = nn.LayerNorm(hidden_dim)
+        self.decoder = TransformerDecoder(
+            decoder_layer,
+            num_decoder_layers,
+            decoder_norm,
+            return_intermediate=return_intermediate_dec)
+
+        self.input_proj = nn.Conv2D(
+            backbone_num_channels, hidden_dim, kernel_size=1)
+        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
+        self.position_embedding = PositionEmbedding(
+            hidden_dim // 2,
+            normalize=True if position_embed_type == 'sine' else False,
+            embed_type=position_embed_type)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                xavier_uniform_(p)
+        conv_init_(self.input_proj)
+        normal_(self.query_pos_embed.weight)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {
+            'backbone_num_channels': [i.channels for i in input_shape][-1],
+        }
+
+    def forward(self, src, src_mask=None):
+        r"""
+        Applies a Transformer model on the inputs.
+
+        Parameters:
+            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]].
+            src_mask (Tensor, optional): A tensor used in multi-head attention
+                to prevent attention to some unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                `[bs, H, W]`. When the data type is bool, the unwanted positions
+                have `False` values and the others have `True` values. When the
+                data type is int, the unwanted positions have 0 values and the
+                others have 1 values. When the data type is float, the unwanted
+                positions have `-INF` values and the others have 0 values. It
+                can be None when no position needs to be masked. Default None.
+
+        Returns:
+            output (Tensor): [num_levels, batch_size, num_queries, hidden_dim]
+            memory (Tensor): [batch_size, hidden_dim, h, w]
+            src_proj (Tensor): [batch_size, h*w, hidden_dim]
+            src_mask (Tensor): [batch_size, 1, 1, h, w]
+        """
+        # use last level feature map
+        src_proj = self.input_proj(src[-1])
+        bs, c, h, w = src_proj.shape
+        # flatten [B, C, H, W] to [B, HxW, C]
+        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])
+        if src_mask is not None:
+            src_mask = F.interpolate(
+                src_mask.unsqueeze(0).astype(src_flatten.dtype),
+                size=(h, w))[0].astype('bool')
+        else:
+            src_mask = paddle.ones([bs, h, w], dtype='bool')
+        pos_embed = self.position_embedding(src_mask).flatten(2).transpose(
+            [0, 2, 1])
+
+        src_mask = _convert_attention_mask(src_mask, src_flatten.dtype)
+        src_mask = src_mask.reshape([bs, 1, 1, -1])
+
+        memory = self.encoder(
+            src_flatten, src_mask=src_mask, pos_embed=pos_embed)
+
+        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
+            [bs, 1, 1])
+        tgt = paddle.zeros_like(query_pos_embed)
+        output = self.decoder(
+            tgt,
+            memory,
+            memory_mask=src_mask,
+            pos_embed=pos_embed,
+            query_pos_embed=query_pos_embed)
+
+        return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]),
+                src_proj, src_mask.reshape([bs, 1, 1, h, w]))
diff --git a/ppdet/modeling/transformers/matchers.py b/ppdet/modeling/transformers/matchers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e606ef98c1a34186707be8d318393427e7a9f56
--- /dev/null
+++ b/ppdet/modeling/transformers/matchers.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from scipy.optimize import linear_sum_assignment
+
+from ppdet.core.workspace import register, serializable
+from ..losses.iou_loss import GIoULoss
+from .utils import bbox_cxcywh_to_xyxy
+
+__all__ = ['HungarianMatcher']
+
+
+@register
+@serializable
+class HungarianMatcher(nn.Layer):
+    __shared__ = ['use_focal_loss']
+
+    def __init__(self,
+                 matcher_coeff={'class': 1,
+                                'bbox': 5,
+                                'giou': 2},
+                 use_focal_loss=False,
+                 alpha=0.25,
+                 gamma=2.0):
+        r"""
+        Args:
+            matcher_coeff (dict): The coefficients of the Hungarian matcher cost.
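+            use_focal_loss (bool): Whether to compute the classification cost
+                with the focal-loss formulation. False by default.
+            alpha (float): Focal loss alpha parameter. 0.25 by default.
+            gamma (float): Focal loss gamma parameter. 2.0 by default.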
+ """ + super(HungarianMatcher, self).__init__() + self.matcher_coeff = matcher_coeff + self.use_focal_loss = use_focal_loss + self.alpha = alpha + self.gamma = gamma + + self.giou_loss = GIoULoss() + + def forward(self, boxes, logits, gt_bbox, gt_class): + r""" + Args: + boxes (Tensor): [b, query, 4] + logits (Tensor): [b, query, num_classes] + gt_bbox (List(Tensor)): list[[n, 4]] + gt_class (List(Tensor)): list[[n, 1]] + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = boxes.shape[:2] + + num_gts = sum(len(a) for a in gt_class) + if num_gts == 0: + return [(paddle.to_tensor( + [], dtype=paddle.int64), paddle.to_tensor( + [], dtype=paddle.int64)) for _ in range(bs)] + + # We flatten to compute the cost matrices in a batch + # [batch_size * num_queries, num_classes] + out_prob = F.sigmoid(logits.flatten( + 0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1)) + # [batch_size * num_queries, 4] + out_bbox = boxes.flatten(0, 1) + + # Also concat the target labels and boxes + tgt_ids = paddle.concat(gt_class).flatten() + tgt_bbox = paddle.concat(gt_bbox) + + # Compute the classification cost + if self.use_focal_loss: + neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-( + 1 - out_prob + 1e-8).log()) + pos_cost_class = self.alpha * ( + (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log()) + cost_class = paddle.gather( + pos_cost_class, tgt_ids, axis=1) - paddle.gather( + neg_cost_class, tgt_ids, axis=1) + else: + cost_class = -paddle.gather(out_prob, tgt_ids, axis=1) + + # Compute the L1 cost between boxes + cost_bbox = ( + out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1) + + # Compute the giou cost betwen boxes + cost_giou = self.giou_loss( + bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)), + bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1) + + # Final cost matrix + C = self.matcher_coeff['class'] * cost_class + self.matcher_coeff['bbox'] * cost_bbox + \ + self.matcher_coeff['giou'] * cost_giou + C = C.reshape([bs, num_queries, -1]) + C = [a.squeeze(0) for a in C.chunk(bs)] + + sizes = [a.shape[0] for a in gt_bbox] + indices = [ + linear_sum_assignment(c.split(sizes, -1)[i].numpy()) + for i, c in enumerate(C) + ] + return [(paddle.to_tensor( + i, dtype=paddle.int64), paddle.to_tensor( + j, dtype=paddle.int64)) for i, j in indices] diff --git a/ppdet/modeling/transformers/position_encoding.py b/ppdet/modeling/transformers/position_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..2644d36e15607324a1d4b91a2d87b30a6a1a15b6 --- /dev/null +++ b/ppdet/modeling/transformers/position_encoding.py @@ -0,0 +1,101 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/ppdet/modeling/transformers/position_encoding.py b/ppdet/modeling/transformers/position_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..2644d36e15607324a1d4b91a2d87b30a6a1a15b6
--- /dev/null
+++ b/ppdet/modeling/transformers/position_encoding.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+import paddle.nn as nn
+
+from ppdet.core.workspace import register, serializable
+
+
+@register
+@serializable
+class PositionEmbedding(nn.Layer):
+    def __init__(self,
+                 num_pos_feats=128,
+                 temperature=10000,
+                 normalize=True,
+                 scale=None,
+                 embed_type='sine',
+                 num_embeddings=50):
+        super(PositionEmbedding, self).__init__()
+        assert embed_type in ['sine', 'learned']
+
+        self.embed_type = embed_type
+        if self.embed_type == 'sine':
+            self.num_pos_feats = num_pos_feats
+            self.temperature = temperature
+            self.normalize = normalize
+            if scale is not None and normalize is False:
+                raise ValueError("normalize should be True if scale is passed")
+            if scale is None:
+                scale = 2 * math.pi
+            self.scale = scale
+        elif self.embed_type == 'learned':
+            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
+            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
+        else:
+            raise ValueError(f"embed_type {self.embed_type} is not supported")
+
+    def forward(self, mask):
+        """
+        Args:
+            mask (Tensor): [B, H, W]
+        Returns:
+            pos (Tensor): [B, C, H, W]
+        """
+        assert mask.dtype == paddle.bool
+        if self.embed_type == 'sine':
+            mask = mask.astype('float32')
+            y_embed = mask.cumsum(1, dtype='float32')
+            x_embed = mask.cumsum(2, dtype='float32')
+            if self.normalize:
+                eps = 1e-6
+                y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+                x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+            dim_t = 2 * (paddle.arange(self.num_pos_feats) //
+                         2).astype('float32')
+            dim_t = self.temperature**(dim_t / self.num_pos_feats)
+
+            pos_x = x_embed.unsqueeze(-1) / dim_t
+            pos_y = y_embed.unsqueeze(-1) / dim_t
+            pos_x = paddle.stack(
+                (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
+                axis=4).flatten(3)
+            pos_y = paddle.stack(
+                (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
+                axis=4).flatten(3)
+            pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2])
+            return pos
+        elif self.embed_type == 'learned':
+            h, w = mask.shape[-2:]
+            i = paddle.arange(w)
+            j = paddle.arange(h)
+            x_emb = self.col_embed(i)
+            y_emb = self.row_embed(j)
+            # paddle has no torch-style Tensor.repeat; tile takes a list of
+            # repeat counts instead
+            pos = paddle.concat(
+                [
+                    x_emb.unsqueeze(0).tile([h, 1, 1]),
+                    y_emb.unsqueeze(1).tile([1, w, 1]),
+                ],
+                axis=-1).transpose([2, 0, 1]).unsqueeze(0).tile(
+                    [mask.shape[0], 1, 1, 1])
+            return pos
+        else:
+            raise ValueError(f"embed_type {self.embed_type} is not supported")
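The embedding is driven entirely by the validity mask, so a shape check is cheap. An illustrative sketch of the default sine variant:

import paddle

pos_embed = PositionEmbedding(num_pos_feats=128, embed_type='sine')
mask = paddle.ones([2, 25, 34], dtype='bool')   # all positions valid
pos = pos_embed(mask)
# pos.shape == [2, 256, 25, 34]: 128 sine/cosine channels per spatial axis
# (y features first), i.e. hidden_dim when num_pos_feats = hidden_dim // 2.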
diff --git a/ppdet/modeling/transformers/utils.py b/ppdet/modeling/transformers/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8abad9fbd71166b5324e368f5c7d8f8a2fd3ecd
--- /dev/null
+++ b/ppdet/modeling/transformers/utils.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ..bbox_utils import bbox_overlaps
+
+__all__ = [
+    '_get_clones', 'bbox_overlaps', 'bbox_cxcywh_to_xyxy',
+    'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss'
+]
+
+
+def _get_clones(module, N):
+    # deep-copy a layer N times into a LayerList
+    return nn.LayerList([copy.deepcopy(module) for _ in range(N)])
+
+
+def bbox_cxcywh_to_xyxy(x):
+    x_c, y_c, w, h = x.unbind(-1)
+    b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
+    return paddle.stack(b, axis=-1)
+
+
+def bbox_xyxy_to_cxcywh(x):
+    x0, y0, x1, y1 = x.unbind(-1)
+    b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)]
+    return paddle.stack(b, axis=-1)
+
+
+def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
+    prob = F.sigmoid(logit)
+    ce_loss = F.binary_cross_entropy_with_logits(logit, label, reduction="none")
+    p_t = prob * label + (1 - prob) * (1 - label)
+    loss = ce_loss * ((1 - p_t)**gamma)
+
+    if alpha >= 0:
+        alpha_t = alpha * label + (1 - alpha) * (1 - label)
+        loss = alpha_t * loss
+    # sum over queries, then optionally normalize (e.g. by the GT count)
+    loss = loss.mean(1).sum()
+    return loss / normalizer if normalizer > 1. else loss
diff --git a/ppdet/optimizer.py b/ppdet/optimizer.py
index 6b0926488662609a278d7f6948488962475a42a3..034be246002f6d4d35cea7f8409645f2e0a1e5ab 100644
--- a/ppdet/optimizer.py
+++ b/ppdet/optimizer.py
@@ -244,10 +244,11 @@ class OptimizerBuilder():
         optim_args = self.optimizer.copy()
         optim_type = optim_args['type']
         del optim_args['type']
+        if optim_type != 'AdamW':
+            optim_args['weight_decay'] = regularization
         op = getattr(optimizer, optim_type)
         return op(learning_rate=learning_rate,
                   parameters=params,
-                  weight_decay=regularization,
                   grad_clip=grad_clip,
                   **optim_args)
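The optimizer change deserves a note: `paddle.optimizer.AdamW` takes its decoupled `weight_decay` as a plain float, and DETR-style configs already supply it inside the optimizer args, so also forwarding the regularizer object as an explicit `weight_decay` kwarg would clash with `**optim_args`. Routing `regularization` through `optim_args` for every other optimizer preserves the old behaviour. A sketch of the call this produces for AdamW (values and the `model` name are illustrative):

import paddle

# Roughly what OptimizerBuilder now builds from
#   optimizer: {type: AdamW, weight_decay: 0.0001}
opt = paddle.optimizer.AdamW(
    learning_rate=1e-4,
    parameters=model.parameters(),   # 'model' is a stand-in
    grad_clip=paddle.nn.ClipGradByGlobalNorm(0.1),
    weight_decay=1e-4)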