Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
3c1c576d
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
1 年多 前同步成功
通知
696
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3c1c576d
编写于
7月 07, 2021
作者:
S
shangliang Xu
提交者:
GitHub
7月 07, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Transformer] Add transformer base code (#3612)
* Add DETR * drop return_pad_mask in PadBatch
上级
43515234
变更
17
隐藏空白更改
内联
并排
Showing
17 changed files
with
1882 additions
and
12 deletions
+1882
-12
ppdet/data/transform/operators.py
ppdet/data/transform/operators.py
+353
-1
ppdet/modeling/__init__.py
ppdet/modeling/__init__.py
+2
-0
ppdet/modeling/architectures/__init__.py
ppdet/modeling/architectures/__init__.py
+2
-0
ppdet/modeling/architectures/detr.py
ppdet/modeling/architectures/detr.py
+93
-0
ppdet/modeling/heads/__init__.py
ppdet/modeling/heads/__init__.py
+2
-0
ppdet/modeling/heads/detr_head.py
ppdet/modeling/heads/detr_head.py
+278
-0
ppdet/modeling/initializer.py
ppdet/modeling/initializer.py
+18
-4
ppdet/modeling/layers.py
ppdet/modeling/layers.py
+179
-0
ppdet/modeling/losses/__init__.py
ppdet/modeling/losses/__init__.py
+2
-0
ppdet/modeling/losses/detr_loss.py
ppdet/modeling/losses/detr_loss.py
+230
-0
ppdet/modeling/post_process.py
ppdet/modeling/post_process.py
+65
-6
ppdet/modeling/transformers/__init__.py
ppdet/modeling/transformers/__init__.py
+23
-0
ppdet/modeling/transformers/detr_transformer.py
ppdet/modeling/transformers/detr_transformer.py
+351
-0
ppdet/modeling/transformers/matchers.py
ppdet/modeling/transformers/matchers.py
+123
-0
ppdet/modeling/transformers/position_encoding.py
ppdet/modeling/transformers/position_encoding.py
+101
-0
ppdet/modeling/transformers/utils.py
ppdet/modeling/transformers/utils.py
+58
-0
ppdet/optimizer.py
ppdet/optimizer.py
+2
-1
未找到文件。
ppdet/data/transform/operators.py
浏览文件 @
3c1c576d
...
...
@@ -40,6 +40,7 @@ from PIL import Image, ImageEnhance, ImageDraw
from
ppdet.core.workspace
import
serializable
from
ppdet.modeling.layers
import
AnchorGrid
from
ppdet.modeling
import
bbox_utils
from
..reader
import
Compose
from
.op_helper
import
(
satisfy_sample_constraint
,
filter_and_process
,
generate_sample_bbox
,
clip_bbox
,
data_anchor_sampling
,
...
...
@@ -2348,7 +2349,7 @@ class RandomResizeCrop(BaseOperator):
for
gt_segm
in
sample
[
'gt_segm'
]
]
sample
[
'gt_segm'
]
=
np
.
asarray
(
masks
).
astype
(
np
.
uint8
)
return
sample
...
...
@@ -2528,3 +2529,354 @@ class Mosaic(BaseOperator):
sample
[
'difficult'
]
=
difficult
return
sample
@register_op
class RandomSelect(BaseOperator):
    """Apply one of two transform pipelines, chosen at random per sample.

    With probability ``p`` the sample goes through ``transforms1``;
    otherwise it goes through ``transforms2``.
    """

    def __init__(self, transforms1, transforms2, p=0.5):
        super(RandomSelect, self).__init__()
        self.transforms1 = Compose(transforms1)
        self.transforms2 = Compose(transforms2)
        self.p = p

    def apply(self, sample, context=None):
        # A single draw below p selects the first pipeline.
        chosen = self.transforms1 if random.random() < self.p else self.transforms2
        return chosen(sample)
@register_op
class RandomShortSideResize(BaseOperator):
    def __init__(self,
                 short_side_sizes,
                 max_size=None,
                 interp=cv2.INTER_LINEAR,
                 random_interp=False):
        """
        Resize the image randomly according to the short side. If max_size is not None,
        the long side is scaled according to max_size. The whole process will be keep ratio.
        Args:
            short_side_sizes (list|tuple): Image target short side size.
            max_size (int): The size of the longest side of image after resize.
            interp (int): The interpolation method.
            random_interp (bool): Whether random select interpolation method.
        """
        super(RandomShortSideResize, self).__init__()

        assert isinstance(short_side_sizes,
                          Sequence), "short_side_sizes must be List or Tuple"

        self.short_side_sizes = short_side_sizes
        self.max_size = max_size
        self.interp = interp
        self.random_interp = random_interp
        # Candidate interpolation methods sampled when random_interp is True.
        self.interps = [
            cv2.INTER_NEAREST,
            cv2.INTER_LINEAR,
            cv2.INTER_AREA,
            cv2.INTER_CUBIC,
            cv2.INTER_LANCZOS4,
        ]

    def get_size_with_aspect_ratio(self, image_shape, size, max_size=None):
        # Compute the aspect-preserving (w, h) whose short side equals
        # `size`; optionally shrink so the long side stays <= max_size.
        h, w = image_shape
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            # Shrink the target short side if the implied long side would
            # overflow max_size.
            if max_original_size / min_original_size * size > max_size:
                size = int(
                    round(max_size * min_original_size / max_original_size))

        # Short side already matches: return the original size unchanged.
        if (w <= h and w == size) or (h <= w and h == size):
            return (w, h)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (ow, oh)

    def resize(self, sample, target_size, max_size=None,
               interp=cv2.INTER_LINEAR):
        # Resize the image and every spatial annotation in `sample`
        # (bboxes, polygons, semantic map, instance masks) consistently.
        im = sample['image']
        if not isinstance(im, np.ndarray):
            raise TypeError("{}: image type is not numpy.".format(self))
        if len(im.shape) != 3:
            raise ImageError('{}: image is not 3-dimensional.'.format(self))

        # Final (w, h) after aspect-ratio fitting.
        target_size = self.get_size_with_aspect_ratio(im.shape[:2],
                                                      target_size, max_size)
        im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[
            0] / im.shape[1]

        sample['image'] = cv2.resize(im, target_size, interpolation=interp)
        # im_shape is stored (h, w), hence reversing the (w, h) target_size.
        sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32)
        if 'scale_factor' in sample:
            scale_factor = sample['scale_factor']
            # Accumulate onto any previously applied scaling ([y, x] order).
            sample['scale_factor'] = np.asarray(
                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
                dtype=np.float32)
        else:
            sample['scale_factor'] = np.asarray(
                [im_scale_y, im_scale_x], dtype=np.float32)

        # apply bbox
        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'],
                                                [im_scale_x, im_scale_y],
                                                target_size)
        # apply polygon
        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
            sample['gt_poly'] = self.apply_segm(
                sample['gt_poly'], im.shape[:2], [im_scale_x, im_scale_y])
        # apply semantic
        if 'semantic' in sample and sample['semantic']:
            semantic = sample['semantic']
            # NOTE(review): uses self.interp, not the per-call `interp`
            # argument — confirm this asymmetry is intended.
            semantic = cv2.resize(
                semantic.astype('float32'),
                target_size,
                interpolation=self.interp)
            semantic = np.asarray(semantic).astype('int32')
            semantic = np.expand_dims(semantic, 0)
            sample['semantic'] = semantic
        # apply gt_segm
        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
            # Nearest-neighbour keeps mask labels discrete.
            masks = [
                cv2.resize(
                    gt_segm, target_size, interpolation=cv2.INTER_NEAREST)
                for gt_segm in sample['gt_segm']
            ]
            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
        return sample

    def apply_bbox(self, bbox, scale, size):
        # Scale xyxy boxes in place and clip them to the resized canvas.
        im_scale_x, im_scale_y = scale
        resize_w, resize_h = size
        bbox[:, 0::2] *= im_scale_x
        bbox[:, 1::2] *= im_scale_y
        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
        return bbox.astype('float32')

    def apply_segm(self, segms, im_size, scale):
        # Rescale segmentations; supports both polygon lists and COCO RLE.
        def _resize_poly(poly, im_scale_x, im_scale_y):
            # poly is a flat [x0, y0, x1, y1, ...] coordinate list.
            resized_poly = np.array(poly).astype('float32')
            resized_poly[0::2] *= im_scale_x
            resized_poly[1::2] *= im_scale_y
            return resized_poly.tolist()

        def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):
            # Uncompressed (list) counts must be compiled to a full RLE first.
            if 'counts' in rle and type(rle['counts']) == list:
                rle = mask_util.frPyObjects(rle, im_h, im_w)

            mask = mask_util.decode(rle)
            mask = cv2.resize(
                mask,
                None,
                None,
                fx=im_scale_x,
                fy=im_scale_y,
                interpolation=self.interp)
            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
            return rle

        im_h, im_w = im_size
        im_scale_x, im_scale_y = scale
        resized_segms = []
        for segm in segms:
            if is_poly(segm):
                # Polygon format
                resized_segms.append([
                    _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm
                ])
            else:
                # RLE format
                # mask_util is bound here and seen by _resize_rle via closure.
                import pycocotools.mask as mask_util
                resized_segms.append(
                    _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))

        return resized_segms

    def apply(self, sample, context=None):
        # Pick a random target short side (and optionally interpolation).
        target_size = random.choice(self.short_side_sizes)
        interp = random.choice(
            self.interps) if self.random_interp else self.interp
        return self.resize(sample, target_size, self.max_size, interp)
@register_op
class RandomSizeCrop(BaseOperator):
    """
    Cut the image randomly according to `min_size` and `max_size`
    """

    def __init__(self, min_size, max_size):
        super(RandomSizeCrop, self).__init__()
        # Crop height/width are drawn uniformly from [min_size, max_size],
        # capped by the actual image dimensions in apply().
        self.min_size = min_size
        self.max_size = max_size

        # Deferred import keeps module load light; bound as an attribute
        # so crop() can call it without re-importing.
        from paddle.vision.transforms.functional import crop as paddle_crop
        self.paddle_crop = paddle_crop

    @staticmethod
    def get_crop_params(img_shape, output_size):
        """Get parameters for ``crop`` for a random crop.
        Args:
            img_shape (list|tuple): Image's height and width.
            output_size (list|tuple): Expected output size of the crop.
        Returns:
            tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
        """
        h, w = img_shape
        th, tw = output_size

        if h + 1 < th or w + 1 < tw:
            raise ValueError(
                "Required crop size {} is larger then input image size {}".
                format((th, tw), (h, w)))

        # Exact fit: the only valid crop is the whole image.
        if w == tw and h == th:
            return 0, 0, h, w

        i = random.randint(0, h - th + 1)
        j = random.randint(0, w - tw + 1)
        return i, j, th, tw

    def crop(self, sample, region):
        # Crop the image and every spatial annotation; boxes degenerating
        # to zero area are dropped together with their labels/masks.
        image_shape = sample['image'].shape[:2]
        sample['image'] = self.paddle_crop(sample['image'], *region)
        keep_index = None
        # apply bbox
        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], region)
            bbox = sample['gt_bbox'].reshape([-1, 2, 2])
            # Box area after clipping; zero-area boxes fell outside the crop.
            area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1)
            keep_index = np.where(area > 0)[0]
            sample['gt_bbox'] = sample['gt_bbox'][keep_index] if len(
                keep_index) > 0 else np.zeros(
                    [0, 4], dtype=np.float32)
            sample['gt_class'] = sample['gt_class'][keep_index] if len(
                keep_index) > 0 else np.zeros(
                    [0, 1], dtype=np.float32)
            if 'gt_score' in sample:
                sample['gt_score'] = sample['gt_score'][keep_index] if len(
                    keep_index) > 0 else np.zeros(
                        [0, 1], dtype=np.float32)
            if 'is_crowd' in sample:
                sample['is_crowd'] = sample['is_crowd'][keep_index] if len(
                    keep_index) > 0 else np.zeros(
                        [0, 1], dtype=np.float32)
        # apply polygon
        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region,
                                                image_shape)
            if keep_index is not None:
                sample['gt_poly'] = sample['gt_poly'][keep_index]
        # apply gt_segm
        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
            i, j, h, w = region
            sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w]
            if keep_index is not None:
                sample['gt_segm'] = sample['gt_segm'][keep_index]
        return sample

    def apply_bbox(self, bbox, region):
        # Translate xyxy boxes into crop coordinates, then clamp them
        # to the crop rectangle [0, w] x [0, h].
        i, j, h, w = region
        region_size = np.asarray([w, h])
        crop_bbox = bbox - np.asarray([j, i, j, i])
        crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size)
        crop_bbox = crop_bbox.clip(min=0)
        return crop_bbox.reshape([-1, 4]).astype('float32')

    def apply_segm(self, segms, region, image_shape):
        # Clip segmentations to the crop window; polygons are intersected
        # geometrically (shapely), RLE masks are sliced pixel-wise.
        def _crop_poly(segm, crop):
            xmin, ymin, xmax, ymax = crop
            crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
            crop_p = np.array(crop_coord).reshape(4, 2)
            crop_p = Polygon(crop_p)

            crop_segm = list()
            for poly in segm:
                poly = np.array(poly).reshape(len(poly) // 2, 2)
                polygon = Polygon(poly)
                if not polygon.is_valid:
                    # Repair self-intersecting polygons by re-polygonizing
                    # the exterior ring.
                    exterior = polygon.exterior
                    multi_lines = exterior.intersection(exterior)
                    polygons = shapely.ops.polygonize(multi_lines)
                    polygon = MultiPolygon(polygons)
                multi_polygon = list()
                if isinstance(polygon, MultiPolygon):
                    multi_polygon = copy.deepcopy(polygon)
                else:
                    multi_polygon.append(copy.deepcopy(polygon))
                for per_polygon in multi_polygon:
                    inter = per_polygon.intersection(crop_p)
                    if not inter:
                        continue
                    if isinstance(inter, (MultiPolygon, GeometryCollection)):
                        for part in inter:
                            if not isinstance(part, Polygon):
                                continue
                            # Drop the closing vertex and flatten to the
                            # [x0, y0, x1, y1, ...] layout.
                            part = np.squeeze(
                                np.array(part.exterior.coords[:-1]).reshape(1,
                                                                            -1))
                            part[0::2] -= xmin
                            part[1::2] -= ymin
                            crop_segm.append(part.tolist())
                    elif isinstance(inter, Polygon):
                        crop_poly = np.squeeze(
                            np.array(inter.exterior.coords[:-1]).reshape(1, -1))
                        crop_poly[0::2] -= xmin
                        crop_poly[1::2] -= ymin
                        crop_segm.append(crop_poly.tolist())
                    else:
                        continue
            return crop_segm

        def _crop_rle(rle, crop, height, width):
            # Uncompressed (list) counts must be compiled to full RLE first.
            if 'counts' in rle and type(rle['counts']) == list:
                rle = mask_util.frPyObjects(rle, height, width)
            mask = mask_util.decode(rle)
            # crop is [xmin, ymin, xmax, ymax]; slicing is rows then cols.
            mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
            return rle

        i, j, h, w = region
        crop = [j, i, j + w, i + h]
        height, width = image_shape
        crop_segms = []
        for segm in segms:
            if is_poly(segm):
                # Deferred imports are bound here and visible to
                # _crop_poly via the enclosing-function scope.
                import copy
                import shapely.ops
                from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
                # Polygon format
                crop_segms.append(_crop_poly(segm, crop))
            else:
                # RLE format
                import pycocotools.mask as mask_util
                crop_segms.append(_crop_rle(segm, crop, height, width))
        return crop_segms

    def apply(self, sample, context=None):
        # Sample a crop size bounded by both [min_size, max_size] and the
        # actual image, then pick a random placement for it.
        h = random.randint(self.min_size,
                           min(sample['image'].shape[0], self.max_size))
        w = random.randint(self.min_size,
                           min(sample['image'].shape[1], self.max_size))
        region = self.get_crop_params(sample['image'].shape[:2], [h, w])
        return self.crop(sample, region)
ppdet/modeling/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -27,6 +27,7 @@ from . import post_process
from
.
import
layers
from
.
import
reid
from
.
import
mot
from
.
import
transformers
from
.ops
import
*
from
.backbones
import
*
...
...
@@ -39,3 +40,4 @@ from .post_process import *
from
.layers
import
*
from
.reid
import
*
from
.mot
import
*
from
.transformers
import
*
ppdet/modeling/architectures/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -21,6 +21,7 @@ from . import jde
from
.
import
deepsort
from
.
import
fairmot
from
.
import
centernet
from
.
import
detr
from
.meta_arch
import
*
from
.faster_rcnn
import
*
...
...
@@ -39,3 +40,4 @@ from .deepsort import *
from
.fairmot
import
*
from
.centernet
import
*
from
.blazeface
import
*
from
.detr
import
*
ppdet/modeling/architectures/detr.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
from
.meta_arch
import
BaseArch
from
ppdet.core.workspace
import
register
,
create
__all__
=
[
'DETR'
]
@register
class DETR(BaseArch):
    """DETR detection architecture: backbone -> transformer -> DETR head.

    Args:
        backbone (nn.Layer): feature-extraction network.
        transformer (nn.Layer): DETR encoder/decoder transformer.
        detr_head (nn.Layer): classification + box-regression head.
        post_process (object): converts raw head outputs into final boxes
            at inference time; injected, defaults to 'DETRBBoxPostProcess'.
    """
    __category__ = 'architecture'
    __inject__ = ['post_process']

    def __init__(self,
                 backbone,
                 transformer,
                 detr_head,
                 post_process='DETRBBoxPostProcess'):
        super(DETR, self).__init__()
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.post_process = post_process

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Build sub-modules in dependency order, wiring each later module
        from the shapes exposed by the earlier ones."""
        backbone = create(cfg['backbone'])

        transformer = create(cfg['transformer'],
                             **{'input_shape': backbone.out_shape})

        head_kwargs = {
            'hidden_dim': transformer.hidden_dim,
            'nhead': transformer.nhead,
            'input_shape': backbone.out_shape
        }
        detr_head = create(cfg['detr_head'], **head_kwargs)

        return {
            'backbone': backbone,
            'transformer': transformer,
            "detr_head": detr_head,
        }

    def _forward(self):
        # Backbone features, then the transformer conditioned on pad_mask.
        body_feats = self.backbone(self.inputs)
        out_transformer = self.transformer(body_feats,
                                           self.inputs['pad_mask'])

        if self.training:
            # During training the head returns the loss dict directly.
            return self.detr_head(out_transformer, body_feats, self.inputs)

        preds = self.detr_head(out_transformer, body_feats)
        bbox, bbox_num = self.post_process(
            preds, self.inputs['im_shape'], self.inputs['scale_factor'])
        return bbox, bbox_num

    def get_loss(self, ):
        losses = self._forward()
        # Total loss = sum of every entry that is not a logging-only value.
        total = paddle.add_n(
            [v for k, v in losses.items() if 'log' not in k])
        losses.update({'loss': total})
        return losses

    def get_pred(self):
        bbox_pred, bbox_num = self._forward()
        return {
            "bbox": bbox_pred,
            "bbox_num": bbox_num,
        }
ppdet/modeling/heads/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -25,6 +25,7 @@ from . import face_head
from
.
import
s2anet_head
from
.
import
keypoint_hrhrnet_head
from
.
import
centernet_head
from
.
import
detr_head
from
.bbox_head
import
*
from
.mask_head
import
*
...
...
@@ -39,3 +40,4 @@ from .face_head import *
from
.s2anet_head
import
*
from
.keypoint_hrhrnet_head
import
*
from
.centernet_head
import
*
from
.detr_head
import
*
ppdet/modeling/heads/detr_head.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
ppdet.core.workspace
import
register
import
pycocotools.mask
as
mask_util
from
..initializer
import
*
__all__
=
[
'DETRHead'
]
class MLP(nn.Layer):
    """Simple multi-layer perceptron with ReLU between layers.

    The final layer is linear (no activation). Layers are initialized
    with ``linear_init_``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer i maps dims[i] -> dims[i + 1].
        dims = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.LayerList(
            nn.Linear(dims[i], dims[i + 1]) for i in range(num_layers))
        self._reset_parameters()

    def _reset_parameters(self):
        for layer in self.layers:
            linear_init_(layer)

    def forward(self, x):
        last = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last:
                x = F.relu(x)
        return x
class MultiHeadAttentionMap(nn.Layer):
    """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""

    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0,
                 bias=True):
        super().__init__()
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout)

        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.XavierUniform())
        # bias=False disables the bias parameter entirely (paddle accepts
        # bias_attr=False for that).
        bias_attr = paddle.framework.ParamAttr(
            initializer=paddle.nn.initializer.Constant()) if bias else False

        self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)
        # 1x1 conv acts as a per-pixel linear projection of the key map.
        self.k_proj = nn.Conv2D(
            query_dim,
            hidden_dim,
            1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)
        # Scaled dot-product factor: 1 / sqrt(head_dim).
        self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5

    def forward(self, q, k, mask=None):
        q = self.q_proj(q)
        k = self.k_proj(k)
        # bs: batch, num_queries: decoder queries, n: heads,
        # c: per-head channels, (h, w): key feature-map size.
        bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\
            self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]
        qh = q.reshape([bs, num_queries, n, c])
        kh = k.reshape([bs, n, c, h, w])
        # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
        # The einsum above is emulated with batched matmul: fold (bs, n)
        # into the batch dim, contract over c, then restore the layout.
        qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c])
        kh = kh.reshape([-1, c, h * w])
        weights = paddle.bmm(qh * self.normalize_fact, kh).reshape(
            [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4])

        if mask is not None:
            weights += mask
        # fix a potential bug: https://github.com/facebookresearch/detr/issues/247
        # Softmax over the flattened (h*w) spatial positions.
        weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape)
        weights = self.dropout(weights)
        return weights
class MaskHeadFPNConv(nn.Layer):
    """
    Simple convolutional head, using group norm.
    Upsampling is done using a FPN approach
    """

    def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):
        super().__init__()

        # Channel schedule: input_dim, then context_dim/2, /4, /8, /16.
        inter_dims = [input_dim,
                      ] + [context_dim // (2**i) for i in range(1, 5)]
        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.KaimingUniform())
        bias_attr = paddle.framework.ParamAttr(
            initializer=paddle.nn.initializer.Constant())

        self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,
                                       weight_attr, bias_attr)
        # One conv stage per step down the channel schedule.
        self.conv_inter = nn.LayerList()
        for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]):
            self.conv_inter.append(
                self._make_layers(in_dims, out_dims, 3, num_groups,
                                  weight_attr, bias_attr))

        # Final 3x3 conv producing a single-channel mask logit map.
        self.conv_out = nn.Conv2D(
            inter_dims[-1],
            1,
            3,
            padding=1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)

        # 1x1 adapters matching each FPN input to the stage's channels.
        self.adapter = nn.LayerList()
        for i in range(len(fpn_dims)):
            self.adapter.append(
                nn.Conv2D(
                    fpn_dims[i],
                    inter_dims[i + 1],
                    1,
                    weight_attr=weight_attr,
                    bias_attr=bias_attr))

    def _make_layers(self,
                     in_dims,
                     out_dims,
                     kernel_size,
                     num_groups,
                     weight_attr=None,
                     bias_attr=None):
        # Conv -> GroupNorm -> ReLU building block.
        return nn.Sequential(
            nn.Conv2D(
                in_dims,
                out_dims,
                kernel_size,
                padding=kernel_size // 2,
                weight_attr=weight_attr,
                bias_attr=bias_attr),
            nn.GroupNorm(num_groups, out_dims),
            nn.ReLU())

    def forward(self, x, bbox_attention_map, fpns):
        # Replicate the projected source once per query and concatenate
        # the per-query attention maps along channels.
        x = paddle.concat([
            x.tile([bbox_attention_map.shape[1], 1, 1, 1]),
            bbox_attention_map.flatten(0, 1)
        ], 1)
        x = self.conv0(x)
        # FPN-style refinement: upsample, add adapted lateral feature,
        # convolve; the last conv_inter stage has no lateral input.
        for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1],
                                                    self.adapter, fpns):
            feat = adapter_layer(feat).tile(
                [bbox_attention_map.shape[1], 1, 1, 1])
            x = inter_layer(x)
            x = feat + F.interpolate(x, size=feat.shape[-2:])

        x = self.conv_inter[-1](x)
        x = self.conv_out(x)
        return x
@register
class DETRHead(nn.Layer):
    # Prediction head for DETR: classification + box regression, with an
    # optional mask branch for segmentation.
    __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']
    __inject__ = ['loss']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 nhead=8,
                 num_mlp_layers=3,
                 loss='DETRLoss',
                 fpn_dims=[1024, 512, 256],  # NOTE(review): mutable default; never mutated here, but a tuple would be safer — confirm config system allows it
                 with_mask_head=False,
                 use_focal_loss=False):
        super(DETRHead, self).__init__()
        # add background class
        # (focal loss scores all classes directly, so no extra slot needed)
        self.num_classes = num_classes if use_focal_loss else num_classes + 1
        self.hidden_dim = hidden_dim
        self.loss = loss
        self.with_mask_head = with_mask_head
        self.use_focal_loss = use_focal_loss

        self.score_head = nn.Linear(hidden_dim, self.num_classes)
        # 4 outputs: normalized box parameters squashed by sigmoid in forward.
        self.bbox_head = MLP(hidden_dim,
                             hidden_dim,
                             output_dim=4,
                             num_layers=num_mlp_layers)
        if self.with_mask_head:
            self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim,
                                                        nhead)
            self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims,
                                             hidden_dim)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.score_head)

    @classmethod
    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
        # fpn_dims: backbone channels from deepest to shallowest, dropping
        # the deepest level (it feeds the transformer, not the FPN).
        return {
            'hidden_dim': hidden_dim,
            'nhead': nhead,
            'fpn_dims': [i.channels for i in input_shape[::-1]][1:]
        }

    @staticmethod
    def get_gt_mask_from_polygons(gt_poly, pad_mask):
        # Rasterize COCO polygon annotations into per-object binary masks,
        # zero-padded up to the batch's padded image size.
        out_gt_mask = []
        for polygons, padding in zip(gt_poly, pad_mask):
            # pad_mask is 1 on valid pixels, so row/column sums recover the
            # unpadded image height and width.
            height, width = int(padding[:, 0].sum()), int(padding[0, :].sum())
            masks = []
            for obj_poly in polygons:
                rles = mask_util.frPyObjects(obj_poly, height, width)
                rle = mask_util.merge(rles)
                masks.append(
                    paddle.to_tensor(mask_util.decode(rle)).astype('float32'))
            masks = paddle.stack(masks)
            masks_pad = paddle.zeros(
                [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]])
            masks_pad[:, :height, :width] = masks
            out_gt_mask.append(masks_pad)
        return out_gt_mask

    def forward(self, out_transformer, body_feats, inputs=None):
        r"""
        Args:
            out_transformer (Tuple): (feats: [num_levels, batch_size,
                                        num_queries, hidden_dim],
                            memory: [batch_size, hidden_dim, h, w],
                            src_proj: [batch_size, h*w, hidden_dim],
                            src_mask: [batch_size, 1, 1, h, w])
            body_feats (List(Tensor)): list[[B, C, H, W]]
            inputs (dict): dict(inputs)

        Returns:
            Training: the loss dict produced by ``self.loss``.
            Inference: (last-level boxes, last-level logits, seg output
            or None).
        """
        feats, memory, src_proj, src_mask = out_transformer
        outputs_logit = self.score_head(feats)
        # Sigmoid keeps box parameters in [0, 1].
        outputs_bbox = F.sigmoid(self.bbox_head(feats))
        outputs_seg = None
        if self.with_mask_head:
            # Attention between the last decoder level and the encoder memory
            # provides per-query spatial maps for the mask head.
            bbox_attention_map = self.bbox_attention(feats[-1], memory,
                                                     src_mask)
            fpn_feats = [a for a in body_feats[::-1]][1:]
            outputs_seg = self.mask_head(src_proj, bbox_attention_map,
                                         fpn_feats)
            # Restore [batch, num_queries, h, w] from the flattened output.
            outputs_seg = outputs_seg.reshape([
                feats.shape[1], feats.shape[2], outputs_seg.shape[-2],
                outputs_seg.shape[-1]
            ])

        if self.training:
            assert inputs is not None
            assert 'gt_bbox' in inputs and 'gt_class' in inputs

            gt_mask = self.get_gt_mask_from_polygons(
                inputs['gt_poly'],
                inputs['pad_mask']) if 'gt_poly' in inputs else None
            return self.loss(
                outputs_bbox,
                outputs_logit,
                inputs['gt_bbox'],
                inputs['gt_class'],
                masks=outputs_seg,
                gt_mask=gt_mask)
        else:
            return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)
ppdet/modeling/initializer.py
浏览文件 @
3c1c576d
...
...
@@ -28,6 +28,8 @@ __all__ = [
'xavier_normal_'
,
'kaiming_uniform_'
,
'kaiming_normal_'
,
'linear_init_'
,
'conv_init_'
,
'reset_initialized_parameter'
,
]
...
...
@@ -46,7 +48,7 @@ def _no_grad_normal_(tensor, mean=0., std=1.):
return
tensor
def
_no_grad_fill_
(
tensor
,
value
=
0
):
def
_no_grad_fill_
(
tensor
,
value
=
0
.
):
with
paddle
.
no_grad
():
v
=
paddle
.
rand
(
shape
=
tensor
.
shape
,
dtype
=
tensor
.
dtype
)
v
[...]
=
value
...
...
@@ -80,7 +82,7 @@ def normal_(tensor, mean=0., std=1.):
return
_no_grad_normal_
(
tensor
,
mean
,
std
)
def
constant_
(
tensor
,
value
=
0
):
def
constant_
(
tensor
,
value
=
0
.
):
"""
Modified tensor inspace using constant_
Args:
...
...
@@ -150,7 +152,7 @@ def xavier_uniform_(tensor, gain=1., reverse=False):
Modified tensor inspace using xavier_uniform_
Args:
tensor (paddle.Tensor): paddle Tensor
gain (
str
): super parameter, 1. default.
gain (
float
): super parameter, 1. default.
reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
Return:
tensor
...
...
@@ -166,7 +168,7 @@ def xavier_normal_(tensor, gain=1., reverse=False):
Modified tensor inspace using xavier_normal_
Args:
tensor (paddle.Tensor): paddle Tensor
gain (
str
): super parameter, 1. default.
gain (
float
): super parameter, 1. default.
reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
Return:
tensor
...
...
@@ -260,6 +262,18 @@ def kaiming_normal_(tensor,
return
_no_grad_normal_
(
tensor
,
0
,
std
)
def linear_init_(module):
    """Initialize a Linear layer's weight and bias uniformly in
    [-1/sqrt(fan_in), 1/sqrt(fan_in)].

    Args:
        module: a layer exposing ``weight`` and ``bias`` parameters;
            ``weight.shape[0]`` is treated as the fan-in.
    """
    bound = 1 / math.sqrt(module.weight.shape[0])
    for param in (module.weight, module.bias):
        uniform_(param, -bound, bound)
def conv_init_(module):
    """Initialize a Conv layer's weight and bias uniformly in
    [-1/sqrt(fan_in), 1/sqrt(fan_in)].

    Args:
        module: a layer exposing ``weight`` and ``bias``; fan-in is the
            product of all weight dims except dim 0.
    """
    fan_in = math.prod(module.weight.shape[1:])
    bound = 1 / math.sqrt(fan_in)
    for param in (module.weight, module.bias):
        uniform_(param, -bound, bound)
@
paddle
.
no_grad
()
def
reset_initialized_parameter
(
model
,
include_self
=
True
):
"""
...
...
ppdet/modeling/layers.py
浏览文件 @
3c1c576d
...
...
@@ -29,8 +29,11 @@ from paddle.regularizer import L2Decay
from
ppdet.core.workspace
import
register
,
serializable
from
ppdet.modeling.bbox_utils
import
delta2bbox
from
.
import
ops
from
.initializer
import
xavier_uniform_
,
constant_
from
paddle.vision.ops
import
DeformConv2D
from
paddle.nn.layer
import
transformer
_convert_attention_mask
=
transformer
.
_convert_attention_mask
def
_to_list
(
l
):
...
...
@@ -1187,3 +1190,179 @@ class Concat(nn.Layer):
def
extra_repr
(
self
):
return
'dim={}'
.
format
(
self
.
dim
)
class
MultiHeadAttention
(
nn
.
Layer
):
"""
Attention mapps queries and a set of key-value pairs to outputs, and
Multi-Head Attention performs multiple parallel attention to jointly attending
to information from different representation subspaces.
Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
for more details.
Parameters:
embed_dim (int): The expected feature size in the input and output.
num_heads (int): The number of heads in multi-head attention.
dropout (float, optional): The dropout probability used on attention
weights to drop some attention targets. 0 for no dropout. Default 0
kdim (int, optional): The feature size in key. If None, assumed equal to
`embed_dim`. Default None.
vdim (int, optional): The feature size in value. If None, assumed equal to
`embed_dim`. Default None.
need_weights (bool, optional): Indicate whether to return the attention
weights. Default False.
Examples:
.. code-block:: python
import paddle
# encoder input: [batch_size, sequence_length, d_model]
query = paddle.rand((2, 4, 128))
# self attention mask: [batch_size, num_heads, query_len, query_len]
attn_mask = paddle.rand((2, 2, 4, 4))
multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)
output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
"""
    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 kdim=None,
                 vdim=None,
                 need_weights=False):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        # Key/value dims default to embed_dim when not given.
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        # When q/k/v dims all match, a single fused qkv projection is used.
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.need_weights = need_weights

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        if self._qkv_same_embed_dim:
            # Fused projection: one [embed_dim, 3*embed_dim] weight holding
            # q, k and v slices side by side (sliced in compute_qkv).
            self.in_proj_weight = self.create_parameter(
                shape=[embed_dim, 3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=False)
            self.in_proj_bias = self.create_parameter(
                shape=[3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=True)
        else:
            # Separate projections when key/value dims differ.
            self.q_proj = nn.Linear(embed_dim, embed_dim)
            self.k_proj = nn.Linear(self.kdim, embed_dim)
            self.v_proj = nn.Linear(self.vdim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)
        # Attribute names indexed by compute_qkv(tensor, index).
        self._type_list = ('q_proj', 'k_proj', 'v_proj')

        self._reset_parameters()
def
_reset_parameters
(
self
):
for
p
in
self
.
parameters
():
if
p
.
dim
()
>
1
:
xavier_uniform_
(
p
)
else
:
constant_
(
p
)
def
compute_qkv
(
self
,
tensor
,
index
):
if
self
.
_qkv_same_embed_dim
:
tensor
=
F
.
linear
(
x
=
tensor
,
weight
=
self
.
in_proj_weight
[:,
index
*
self
.
embed_dim
:(
index
+
1
)
*
self
.
embed_dim
],
bias
=
self
.
in_proj_bias
[
index
*
self
.
embed_dim
:(
index
+
1
)
*
self
.
embed_dim
]
if
self
.
in_proj_bias
is
not
None
else
None
)
else
:
tensor
=
getattr
(
self
,
self
.
_type_list
[
index
])(
tensor
)
tensor
=
tensor
.
reshape
(
[
0
,
0
,
self
.
num_heads
,
self
.
head_dim
]).
transpose
([
0
,
2
,
1
,
3
])
return
tensor
def
forward
(
self
,
query
,
key
=
None
,
value
=
None
,
attn_mask
=
None
):
r
"""
Applies multi-head attention to map queries and a set of key-value pairs
to outputs.
Parameters:
query (Tensor): The queries for multi-head attention. It is a
tensor with shape `[batch_size, query_length, embed_dim]`. The
data type should be float32 or float64.
key (Tensor, optional): The keys for multi-head attention. It is
a tensor with shape `[batch_size, key_length, kdim]`. The
data type should be float32 or float64. If None, use `query` as
`key`. Default None.
value (Tensor, optional): The values for multi-head attention. It
is a tensor with shape `[batch_size, value_length, vdim]`.
The data type should be float32 or float64. If None, use `query` as
`value`. Default None.
attn_mask (Tensor, optional): A tensor used in multi-head attention
to prevents attention to some unwanted positions, usually the
paddings or the subsequent positions. It is a tensor with shape
broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
When the data type is bool, the unwanted positions have `False`
values and the others have `True` values. When the data type is
int, the unwanted positions have 0 values and the others have 1
values. When the data type is float, the unwanted positions have
`-INF` values and the others have 0 values. It can be None when
nothing wanted or needed to be prevented attention to. Default None.
Returns:
Tensor|tuple: It is a tensor that has the same shape and data type \
as `query`, representing attention output. Or a tuple if \
`need_weights` is True or `cache` is not None. If `need_weights` \
is True, except for attention output, the tuple also includes \
the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \
If `cache` is not None, the tuple then includes the new cache \
having the same type as `cache`, and if it is `StaticCache`, it \
is same as the input `cache`, if it is `Cache`, the new cache \
reserves tensors concatanating raw tensors with intermediate \
results of current query.
"""
key
=
query
if
key
is
None
else
key
value
=
query
if
value
is
None
else
value
# compute q ,k ,v
q
,
k
,
v
=
(
self
.
compute_qkv
(
t
,
i
)
for
i
,
t
in
enumerate
([
query
,
key
,
value
]))
# scale dot product attention
product
=
paddle
.
matmul
(
x
=
q
,
y
=
k
,
transpose_y
=
True
)
scaling
=
float
(
self
.
head_dim
)
**-
0.5
product
=
product
*
scaling
if
attn_mask
is
not
None
:
# Support bool or int mask
attn_mask
=
_convert_attention_mask
(
attn_mask
,
product
.
dtype
)
product
=
product
+
attn_mask
weights
=
F
.
softmax
(
product
)
if
self
.
dropout
:
weights
=
F
.
dropout
(
weights
,
self
.
dropout
,
training
=
self
.
training
,
mode
=
"upscale_in_train"
)
out
=
paddle
.
matmul
(
weights
,
v
)
# combine heads
out
=
paddle
.
transpose
(
out
,
perm
=
[
0
,
2
,
1
,
3
])
out
=
paddle
.
reshape
(
x
=
out
,
shape
=
[
0
,
0
,
out
.
shape
[
2
]
*
out
.
shape
[
3
]])
# project to output
out
=
self
.
out_proj
(
out
)
outs
=
[
out
]
if
self
.
need_weights
:
outs
.
append
(
weights
)
return
out
if
len
(
outs
)
==
1
else
tuple
(
outs
)
ppdet/modeling/losses/__init__.py
浏览文件 @
3c1c576d
...
...
@@ -22,6 +22,7 @@ from . import ctfocal_loss
from
.
import
keypoint_loss
from
.
import
jde_loss
from
.
import
fairmot_loss
from
.
import
detr_loss
from
.yolo_loss
import
*
from
.iou_aware_loss
import
*
...
...
@@ -33,3 +34,4 @@ from .ctfocal_loss import *
from
.keypoint_loss
import
*
from
.jde_loss
import
*
from
.fairmot_loss
import
*
from
.detr_loss
import
*
ppdet/modeling/losses/detr_loss.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
ppdet.core.workspace
import
register
from
.iou_loss
import
GIoULoss
from
..transformers
import
bbox_cxcywh_to_xyxy
,
bbox_overlaps
,
sigmoid_focal_loss
__all__
=
[
'DETRLoss'
]
@register
class DETRLoss(nn.Layer):
    """DETR set-prediction loss.

    Performs Hungarian matching between predictions and ground truth, then
    computes classification, L1-bbox and GIoU losses (plus optional
    focal/dice mask losses and per-decoder-layer auxiliary losses).
    """
    __shared__ = ['num_classes', 'use_focal_loss']
    __inject__ = ['matcher']

    def __init__(self,
                 num_classes=80,
                 matcher='HungarianMatcher',
                 loss_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'no_object': 0.1,
                     'mask': 1,
                     'dice': 1
                 },
                 aux_loss=True,
                 use_focal_loss=False):
        r"""
        Args:
            num_classes (int): The number of classes.
            matcher (HungarianMatcher): It computes an assignment between the targets
                and the predictions of the network.
            loss_coeff (dict): The coefficient of loss.
            aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.
            use_focal_loss (bool): Use focal loss or not.
        """
        super(DETRLoss, self).__init__()
        self.num_classes = num_classes
        self.matcher = matcher
        # Copy the dict: the default argument is a shared mutable object and
        # the branch below mutates loss_coeff['class'], which would otherwise
        # corrupt the default for every later instance.
        self.loss_coeff = dict(loss_coeff)
        self.aux_loss = aux_loss
        self.use_focal_loss = use_focal_loss

        if not self.use_focal_loss:
            # For softmax cross entropy the class coefficient becomes a
            # per-class weight vector; the extra background ("no object")
            # entry is down-weighted by loss_coeff['no_object'].
            self.loss_coeff['class'] = paddle.full([num_classes + 1],
                                                   loss_coeff['class'])
            self.loss_coeff['class'][-1] = loss_coeff['no_object']
        self.giou_loss = GIoULoss()

    def _get_loss_class(self, logits, gt_class, match_indices, bg_index,
                        num_gts):
        """Classification loss. logits: [b, query, num_classes],
        gt_class: list[[n, 1]]; unmatched queries get label bg_index."""
        target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
        bs, num_query_objects = target_label.shape
        if sum(len(a) for a in gt_class) > 0:
            # Scatter matched gt labels into the flat [bs * query] target.
            index, updates = self._get_index_updates(num_query_objects,
                                                     gt_class, match_indices)
            target_label = paddle.scatter(
                target_label.reshape([-1, 1]), index, updates.astype('int64'))
            target_label = target_label.reshape([bs, num_query_objects])
        if self.use_focal_loss:
            # One-hot without the background column for sigmoid focal loss.
            target_label = F.one_hot(target_label,
                                     self.num_classes + 1)[:, :, :-1]
        return {
            'loss_class': self.loss_coeff['class'] * sigmoid_focal_loss(
                logits, target_label, num_gts / num_query_objects)
            if self.use_focal_loss else F.cross_entropy(
                logits, target_label, weight=self.loss_coeff['class'])
        }

    def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts):
        """L1 + GIoU bbox losses. boxes: [b, query, 4], gt_bbox: list[[n, 4]]."""
        loss = dict()
        if sum(len(a) for a in gt_bbox) == 0:
            # No ground truth at all: return zero losses, keep keys stable.
            loss['loss_bbox'] = paddle.to_tensor([0.])
            loss['loss_giou'] = paddle.to_tensor([0.])
            return loss

        src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
                                                            match_indices)
        loss['loss_bbox'] = self.loss_coeff['bbox'] * F.l1_loss(
            src_bbox, target_bbox, reduction='sum') / num_gts
        loss['loss_giou'] = self.giou_loss(
            bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
        loss['loss_giou'] = loss['loss_giou'].sum() / num_gts
        loss['loss_giou'] = self.loss_coeff['giou'] * loss['loss_giou']
        return loss

    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts):
        """Focal + dice mask losses. masks: [b, query, h, w],
        gt_mask: list[[n, H, W]]."""
        loss = dict()
        if sum(len(a) for a in gt_mask) == 0:
            loss['loss_mask'] = paddle.to_tensor([0.])
            loss['loss_dice'] = paddle.to_tensor([0.])
            return loss

        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
                                                              match_indices)
        # Upsample predictions to ground-truth resolution before comparing.
        src_masks = F.interpolate(
            src_masks.unsqueeze(0),
            size=target_masks.shape[-2:],
            mode="bilinear")[0]
        loss['loss_mask'] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
            src_masks,
            target_masks,
            paddle.to_tensor(
                [num_gts], dtype='float32'))
        loss['loss_dice'] = self.loss_coeff['dice'] * self._dice_loss(
            src_masks, target_masks, num_gts)
        return loss

    def _dice_loss(self, inputs, targets, num_gts):
        """Soft dice loss on sigmoid(inputs), averaged over num_gts."""
        inputs = F.sigmoid(inputs)
        inputs = inputs.flatten(1)
        targets = targets.flatten(1)
        numerator = 2 * (inputs * targets).sum(1)
        denominator = inputs.sum(-1) + targets.sum(-1)
        # +1 in both terms smooths the ratio for empty masks.
        loss = 1 - (numerator + 1) / (denominator + 1)
        return loss.sum() / num_gts

    def _get_loss_aux(self, boxes, logits, gt_bbox, gt_class, bg_index,
                      num_gts):
        """Auxiliary losses: re-match and re-compute class/bbox/giou losses
        for every intermediate decoder layer, then sum per kind."""
        loss_class = []
        loss_bbox = []
        loss_giou = []
        for aux_boxes, aux_logits in zip(boxes, logits):
            match_indices = self.matcher(aux_boxes, aux_logits, gt_bbox,
                                         gt_class)
            loss_class.append(
                self._get_loss_class(aux_logits, gt_class, match_indices,
                                     bg_index, num_gts)['loss_class'])
            loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
                                        num_gts)
            loss_bbox.append(loss_['loss_bbox'])
            loss_giou.append(loss_['loss_giou'])
        loss = {
            'loss_class_aux': paddle.add_n(loss_class),
            'loss_bbox_aux': paddle.add_n(loss_bbox),
            'loss_giou_aux': paddle.add_n(loss_giou)
        }
        return loss

    def _get_index_updates(self, num_query_objects, target, match_indices):
        """Flatten per-image match indices into indices over the
        [bs * num_query_objects] axis plus the gathered target values."""
        batch_idx = paddle.concat([
            paddle.full_like(src, i)
            for i, (src, _) in enumerate(match_indices)
        ])
        src_idx = paddle.concat([src for (src, _) in match_indices])
        # Offset each image's query indices into the flattened batch.
        src_idx += (batch_idx * num_query_objects)
        target_assign = paddle.concat([
            paddle.gather(
                t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
        ])
        return src_idx, target_assign

    def _get_src_target_assign(self, src, target, match_indices):
        """Gather matched prediction/target pairs; images with no matches
        contribute empty [0, last_dim] tensors so concat stays valid."""
        src_assign = paddle.concat([
            paddle.gather(
                t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (I, _) in zip(src, match_indices)
        ])
        target_assign = paddle.concat([
            paddle.gather(
                t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (_, J) in zip(target, match_indices)
        ])
        return src_assign, target_assign

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None):
        r"""
        Args:
            boxes (Tensor): [l, b, query, 4]
            logits (Tensor): [l, b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor, optional): [b, query, h, w]
            gt_mask (List(Tensor), optional): list[[n, H, W]]

        Returns:
            dict: losses keyed by 'loss_class', 'loss_bbox', 'loss_giou' and,
            when applicable, mask and '*_aux' entries.
        """
        # Match on the last decoder layer only; detach so no gradient flows
        # through the assignment itself.
        match_indices = self.matcher(boxes[-1].detach(), logits[-1].detach(),
                                     gt_bbox, gt_class)
        num_gts = sum(len(a) for a in gt_bbox)
        try:
            # TODO: Paddle does not have a "paddle.distributed.is_initialized()",
            # so try averaging the gt count across ranks and fall back to the
            # local count on single-card runs.
            num_gts = paddle.to_tensor([num_gts], dtype=paddle.float32)
            paddle.distributed.all_reduce(num_gts)
            num_gts = paddle.clip(
                num_gts / paddle.distributed.get_world_size(), min=1).item()
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            num_gts = max(num_gts, 1)
        total_loss = dict()
        total_loss.update(
            self._get_loss_class(logits[-1], gt_class, match_indices,
                                 self.num_classes, num_gts))
        total_loss.update(
            self._get_loss_bbox(boxes[-1], gt_bbox, match_indices, num_gts))
        if masks is not None and gt_mask is not None:
            total_loss.update(
                self._get_loss_mask(masks, gt_mask, match_indices, num_gts))

        if self.aux_loss:
            total_loss.update(
                self._get_loss_aux(boxes[:-1], logits[:-1], gt_bbox, gt_class,
                                   self.num_classes, num_gts))

        return total_loss
ppdet/modeling/post_process.py
浏览文件 @
3c1c576d
...
...
@@ -19,18 +19,16 @@ import paddle.nn.functional as F
from ppdet.core.workspace import register
# NOTE: fixed a duplicated `rbox2poly` in this import list.
from ppdet.modeling.bbox_utils import nonempty_bbox, rbox2poly
from ppdet.modeling.layers import TTFBox
from .transformers import bbox_cxcywh_to_xyxy
try:
    from collections.abc import Sequence
except Exception:
    from collections import Sequence

# Public API of this module; each name listed exactly once (the previous
# list repeated six entries).
__all__ = [
    'BBoxPostProcess', 'MaskPostProcess', 'FCOSPostProcess',
    'S2ANetBBoxPostProcess', 'JDEBBoxPostProcess', 'CenterNetPostProcess',
    'DETRBBoxPostProcess'
]
...
...
@@ -492,3 +490,64 @@ class CenterNetPostProcess(TTFBox):
else
:
results
=
paddle
.
concat
([
clses
,
scores
,
bboxes
],
axis
=
1
)
return
results
,
paddle
.
shape
(
results
)[
0
:
1
]
@register
class DETRBBoxPostProcess(object):
    """Decode DETR head outputs into [label, score, x1, y1, x2, y2] rows
    scaled back to the original (pre-resize) image coordinates."""
    __shared__ = ['num_classes', 'use_focal_loss']
    __inject__ = []

    def __init__(self,
                 num_classes=80,
                 num_top_queries=100,
                 use_focal_loss=False):
        super(DETRBBoxPostProcess, self).__init__()
        # num_top_queries: keep at most this many highest-scoring queries.
        self.num_classes = num_classes
        self.num_top_queries = num_top_queries
        self.use_focal_loss = use_focal_loss

    def __call__(self, head_out, im_shape, scale_factor):
        """
        Decode the bbox.

        Args:
            head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output.
            im_shape (Tensor): The shape of the input image.
            scale_factor (Tensor): The scale factor of the input image.
        Returns:
            bbox_pred (Tensor): The output prediction with shape [N, 6], including
                labels, scores and bboxes. The size of bboxes are corresponding
                to the input image, the bboxes may be used in other branch.
            bbox_num (Tensor): The number of prediction boxes of each batch with
                shape [bs], and is N.
        """
        bboxes, logits, masks = head_out

        # cxcywh (normalized) -> xyxy, then scale to original image size.
        bbox_pred = bbox_cxcywh_to_xyxy(bboxes)
        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
        img_h, img_w = origin_shape.unbind(1)
        origin_shape = paddle.stack(
            [img_w, img_h, img_w, img_h], axis=-1).unsqueeze(0)
        bbox_pred *= origin_shape

        # Focal-trained heads use sigmoid scores; otherwise softmax with the
        # background column (last class) dropped.
        scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax(
            logits)[:, :, :-1]
        scores, labels = scores.max(-1), scores.argmax(-1)
        if scores.shape[1] > self.num_top_queries:
            # Keep the top-k queries per image, gathering per batch element.
            scores, index = paddle.topk(scores, self.num_top_queries, axis=-1)
            labels = paddle.stack(
                [paddle.gather(l, i) for l, i in zip(labels, index)])
            bbox_pred = paddle.stack(
                [paddle.gather(b, i) for b, i in zip(bbox_pred, index)])

        # Assemble [label, score, x1, y1, x2, y2] rows; every image emits the
        # same fixed number of rows.
        bbox_pred = paddle.concat(
            [
                labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1),
                bbox_pred
            ],
            axis=-1)
        bbox_num = paddle.to_tensor(
            bbox_pred.shape[1], dtype='int32').tile([bbox_pred.shape[0]])
        bbox_pred = bbox_pred.reshape([-1, 6])
        return bbox_pred, bbox_num
ppdet/modeling/transformers/__init__.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.
import
detr_transformer
from
.
import
utils
from
.
import
matchers
from
.
import
position_encoding
from
.detr_transformer
import
*
from
.utils
import
*
from
.matchers
import
*
from
.position_encoding
import
*
ppdet/modeling/transformers/detr_transformer.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
import
paddle.nn
as
nn
from
paddle.nn.layer.transformer
import
_convert_attention_mask
import
paddle.nn.functional
as
F
from
ppdet.core.workspace
import
register
from
..layers
import
MultiHeadAttention
from
.position_encoding
import
PositionEmbedding
from
.utils
import
*
from
..initializer
import
*
__all__
=
[
'DETRTransformer'
]
class TransformerEncoderLayer(nn.Layer):
    """One DETR encoder layer: self-attention + feed-forward, each wrapped
    with residual connection, dropout and LayerNorm (pre- or post-norm)."""

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerEncoderLayer, self).__init__()
        # attn/act dropout default to the shared dropout rate when unset.
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        # e.g. F.relu / F.gelu, resolved by name.
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        # Positional embedding is additive and optional.
        return tensor if pos_embed is None else tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        src_mask = _convert_attention_mask(src_mask, src.dtype)

        # --- self-attention sublayer (pre-/post-norm controlled by flag) ---
        residual = src
        if self.normalize_before:
            src = self.norm1(src)
        # Queries and keys carry the positional embedding; values do not.
        q = k = self.with_pos_embed(src, pos_embed)
        src = self.self_attn(q, k, value=src, attn_mask=src_mask)

        src = residual + self.dropout1(src)
        if not self.normalize_before:
            src = self.norm1(src)

        # --- feed-forward sublayer ---
        residual = src
        if self.normalize_before:
            src = self.norm2(src)
        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = residual + self.dropout2(src)
        if not self.normalize_before:
            src = self.norm2(src)
        return src
class TransformerEncoder(nn.Layer):
    """A stack of identical DETR encoder layers with an optional final norm."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, src_mask=None, pos_embed=None):
        # Convert the mask once to the additive-float convention shared by
        # every layer.
        src_mask = _convert_attention_mask(src_mask, src.dtype)

        out = src
        for encoder_layer in self.layers:
            out = encoder_layer(out, src_mask=src_mask, pos_embed=pos_embed)

        return out if self.norm is None else self.norm(out)
class TransformerDecoderLayer(nn.Layer):
    """One DETR decoder layer: self-attention over queries, cross-attention
    into the encoder memory, then feed-forward; each sublayer has residual +
    dropout + LayerNorm, applied pre- or post-norm per `normalize_before`."""

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerDecoderLayer, self).__init__()
        # attn/act dropout default to the shared dropout rate when unset.
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        # Positional embedding is additive and optional.
        return tensor if pos_embed is None else tensor + pos_embed

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)

        # --- self-attention over object queries ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt, query_pos_embed)
        tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask)
        tgt = residual + self.dropout1(tgt)
        if not self.normalize_before:
            tgt = self.norm1(tgt)

        # --- cross-attention: queries attend to encoder memory ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)
        # Query side adds the learned query embedding; key side adds the
        # spatial positional embedding of the memory.
        q = self.with_pos_embed(tgt, query_pos_embed)
        k = self.with_pos_embed(memory, pos_embed)
        tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask)
        tgt = residual + self.dropout2(tgt)
        if not self.normalize_before:
            tgt = self.norm2(tgt)

        # --- feed-forward sublayer ---
        residual = tgt
        if self.normalize_before:
            tgt = self.norm3(tgt)
        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = residual + self.dropout3(tgt)
        if not self.normalize_before:
            tgt = self.norm3(tgt)
        return tgt
class TransformerDecoder(nn.Layer):
    """A stack of DETR decoder layers.

    With ``return_intermediate`` enabled, the normalized output of every
    layer is returned stacked along a new leading axis; otherwise only the
    final output is returned, with a singleton leading axis so both modes
    produce a rank-consistent result.
    """

    def __init__(self,
                 decoder_layer,
                 num_layers,
                 norm=None,
                 return_intermediate=False):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        # Convert both masks once to the additive-float convention.
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)

        out = tgt
        per_layer_outs = []
        for decoder_layer in self.layers:
            out = decoder_layer(
                out,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                pos_embed=pos_embed,
                query_pos_embed=query_pos_embed)
            if self.return_intermediate:
                # NOTE(review): assumes self.norm is set whenever
                # return_intermediate is True (DETRTransformer always passes
                # a LayerNorm here) — confirm before reusing elsewhere.
                per_layer_outs.append(self.norm(out))

        if self.return_intermediate:
            return paddle.stack(per_layer_outs)

        if self.norm is not None:
            out = self.norm(out)
        return out.unsqueeze(0)
@register
class DETRTransformer(nn.Layer):
    """DETR encoder-decoder transformer operating on the last backbone
    feature level, with learned object-query embeddings."""
    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=100,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 backbone_num_channels=2048,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(DETRTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'],\
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        self.hidden_dim = hidden_dim
        self.nhead = nhead

        encoder_layer = TransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        # Pre-norm encoders need a final norm; post-norm ones do not.
        encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)

        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        decoder_norm = nn.LayerNorm(hidden_dim)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec)

        # 1x1 conv projecting backbone channels down to hidden_dim.
        self.input_proj = nn.Conv2D(
            backbone_num_channels, hidden_dim, kernel_size=1)
        # Learned object-query embeddings, shared across the batch.
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
        # half hidden_dim per spatial axis (x and y are concatenated).
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            normalize=True if position_embed_type == 'sine' else False,
            embed_type=position_embed_type)

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
        conv_init_(self.input_proj)
        normal_(self.query_pos_embed.weight)

    @classmethod
    def from_config(cls, cfg, input_shape):
        # Wire the last backbone level's channel count from the config system.
        return {
            'backbone_num_channels': [i.channels for i in input_shape][-1],
        }

    def forward(self, src, src_mask=None):
        r"""
        Applies a Transformer model on the inputs.

        Parameters:
            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]].
            src_mask (Tensor, optional): A tensor used in multi-head attention
                to prevents attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                [bs, H, W]`. When the data type is bool, the unwanted positions
                have `False` values and the others have `True` values. When the
                data type is int, the unwanted positions have 0 values and the
                others have 1 values. When the data type is float, the unwanted
                positions have `-INF` values and the others have 0 values. It
                can be None when nothing wanted or needed to be prevented
                attention to. Default None.

        Returns:
            tuple of four tensors:
                output (Tensor): decoder output,
                    [num_levels, batch_size, num_queries, hidden_dim]
                memory (Tensor): encoder output reshaped to
                    [batch_size, hidden_dim, h, w]
                src_proj (Tensor): projected input feature map
                    [batch_size, hidden_dim, h, w]
                src_mask (Tensor): attention mask reshaped to
                    [batch_size, 1, 1, h, w]
        """
        # use last level feature map
        src_proj = self.input_proj(src[-1])
        bs, c, h, w = src_proj.shape
        # flatten [B, C, H, W] to [B, HxW, C]
        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])
        if src_mask is not None:
            # Resize the padding mask to the feature-map resolution.
            src_mask = F.interpolate(
                src_mask.unsqueeze(0).astype(src_flatten.dtype),
                size=(h, w))[0].astype('bool')
        else:
            # No padding info: treat every position as valid.
            src_mask = paddle.ones([bs, h, w], dtype='bool')
        pos_embed = self.position_embedding(src_mask).flatten(2).transpose(
            [0, 2, 1])

        # Convert bool mask to additive float and broadcast over heads/queries.
        src_mask = _convert_attention_mask(src_mask, src_flatten.dtype)
        src_mask = src_mask.reshape([bs, 1, 1, -1])

        memory = self.encoder(
            src_flatten, src_mask=src_mask, pos_embed=pos_embed)

        # Queries start as zeros; the learned embedding enters via attention.
        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
            [bs, 1, 1])
        tgt = paddle.zeros_like(query_pos_embed)
        output = self.decoder(
            tgt,
            memory,
            memory_mask=src_mask,
            pos_embed=pos_embed,
            query_pos_embed=query_pos_embed)

        return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]),
                src_proj, src_mask.reshape([bs, 1, 1, h, w]))
ppdet/modeling/transformers/matchers.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
scipy.optimize
import
linear_sum_assignment
from
ppdet.core.workspace
import
register
,
serializable
from
..losses.iou_loss
import
GIoULoss
from
.utils
import
bbox_cxcywh_to_xyxy
__all__
=
[
'HungarianMatcher'
]
@register
@serializable
class HungarianMatcher(nn.Layer):
    """Bipartite matcher assigning each ground-truth box to exactly one
    prediction by minimizing a class + L1-bbox + GIoU cost via the
    Hungarian algorithm (scipy's linear_sum_assignment)."""
    __shared__ = ['use_focal_loss']

    def __init__(self,
                 matcher_coeff={'class': 1,
                                'bbox': 5,
                                'giou': 2},
                 use_focal_loss=False,
                 alpha=0.25,
                 gamma=2.0):
        r"""
        Args:
            matcher_coeff (dict): The coefficient of hungarian matcher cost.
            use_focal_loss (bool): Use the focal-style classification cost
                (with `alpha`/`gamma`) instead of -softmax probability.
        """
        super(HungarianMatcher, self).__init__()
        self.matcher_coeff = matcher_coeff
        self.use_focal_loss = use_focal_loss
        self.alpha = alpha
        self.gamma = gamma

        self.giou_loss = GIoULoss()

    def forward(self, boxes, logits, gt_bbox, gt_class):
        r"""
        Args:
            boxes (Tensor): [b, query, 4]
            logits (Tensor): [b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = boxes.shape[:2]

        num_gts = sum(len(a) for a in gt_class)
        if num_gts == 0:
            # No ground truth anywhere in the batch: empty match per image.
            return [(paddle.to_tensor(
                [], dtype=paddle.int64), paddle.to_tensor(
                    [], dtype=paddle.int64)) for _ in range(bs)]

        # We flatten to compute the cost matrices in a batch
        # [batch_size * num_queries, num_classes]
        out_prob = F.sigmoid(logits.flatten(
            0, 1)) if self.use_focal_loss else F.softmax(
                logits.flatten(0, 1))
        # [batch_size * num_queries, 4]
        out_bbox = boxes.flatten(0, 1)

        # Also concat the target labels and boxes
        tgt_ids = paddle.concat(gt_class).flatten()
        tgt_bbox = paddle.concat(gt_bbox)

        # Compute the classification cost
        if self.use_focal_loss:
            # Focal-style cost: pos minus neg term at the target class;
            # 1e-8 guards log(0).
            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
                1 - out_prob + 1e-8).log())
            pos_cost_class = self.alpha * ((1 - out_prob)**self.gamma) * (-(
                out_prob + 1e-8).log())
            cost_class = paddle.gather(
                pos_cost_class, tgt_ids, axis=1) - paddle.gather(
                    neg_cost_class, tgt_ids, axis=1)
        else:
            # 1 - prob would be the true cost; the constant doesn't change
            # the assignment, so -prob is used.
            cost_class = -paddle.gather(out_prob, tgt_ids, axis=1)

        # Compute the L1 cost between boxes
        cost_bbox = (
            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)

        # Compute the giou cost betwen boxes
        cost_giou = self.giou_loss(
            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)

        # Final cost matrix
        C = self.matcher_coeff['class'] * cost_class + self.matcher_coeff['bbox'] * cost_bbox + \
            self.matcher_coeff['giou'] * cost_giou
        # [bs, num_queries, total_num_gts]; then split per image.
        C = C.reshape([bs, num_queries, -1])
        C = [a.squeeze(0) for a in C.chunk(bs)]

        # Each image solves the assignment only against its own gt columns.
        sizes = [a.shape[0] for a in gt_bbox]
        indices = [
            linear_sum_assignment(c.split(sizes, -1)[i].numpy())
            for i, c in enumerate(C)
        ]
        return [(paddle.to_tensor(
            i, dtype=paddle.int64), paddle.to_tensor(
                j, dtype=paddle.int64)) for i, j in indices]
ppdet/modeling/transformers/position_encoding.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
paddle
import
paddle.nn
as
nn
from
ppdet.core.workspace
import
register
,
serializable
@register
@serializable
class PositionEmbedding(nn.Layer):
    """Positional encoding for DETR-style transformers.

    Supports two variants selected by `embed_type`:
      * 'sine'    — fixed sinusoidal encoding (as in "Attention Is All You
                    Need" / DETR), computed from the valid-pixel mask.
      * 'learned' — two nn.Embedding tables, one per spatial axis.

    Args:
        num_pos_feats (int): number of features per axis; the output channel
            count is 2 * num_pos_feats (y-features concatenated with x-features).
        temperature (int): frequency base of the sinusoidal encoding.
        normalize (bool): if True, scale cumulative positions into [0, scale].
        scale (float|None): normalization range; defaults to 2*pi. Only valid
            together with normalize=True.
        embed_type (str): 'sine' or 'learned'.
        num_embeddings (int): table size per axis for the 'learned' variant
            (i.e. max supported H and W of the feature map).
    """

    def __init__(self,
                 num_pos_feats=128,
                 temperature=10000,
                 normalize=True,
                 scale=None,
                 embed_type='sine',
                 num_embeddings=50):
        super(PositionEmbedding, self).__init__()
        assert embed_type in ['sine', 'learned']

        self.embed_type = embed_type
        if self.embed_type == 'sine':
            self.num_pos_feats = num_pos_feats
            self.temperature = temperature
            self.normalize = normalize
            if scale is not None and normalize is False:
                raise ValueError("normalize should be True if scale is passed")
            if scale is None:
                scale = 2 * math.pi
            self.scale = scale
        elif self.embed_type == 'learned':
            # One learnable table per spatial axis; rows index y, cols index x.
            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
        else:
            raise ValueError(f"not supported {self.embed_type}")

    def forward(self, mask):
        """
        Args:
            mask (Tensor): [B, H, W], bool; True marks valid (non-padded)
                pixels. The 'learned' variant only uses its shape.
        Returns:
            pos (Tensor): [B, C, H, W] with C = 2 * num_pos_feats.
        """
        assert mask.dtype == paddle.bool
        if self.embed_type == 'sine':
            mask = mask.astype('float32')
            # Cumulative sums over valid pixels give each pixel its (y, x)
            # position index within the un-padded region.
            y_embed = mask.cumsum(1, dtype='float32')
            x_embed = mask.cumsum(2, dtype='float32')
            if self.normalize:
                eps = 1e-6
                # Divide by the last (largest) cumulative value per row/col to
                # normalize positions into [0, scale].
                y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
                x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
            # dim_t[k] = temperature ** (2*(k//2) / num_pos_feats): paired
            # frequencies so each (sin, cos) pair shares a wavelength.
            dim_t = 2 * (paddle.arange(self.num_pos_feats) //
                         2).astype('float32')
            dim_t = self.temperature**(dim_t / self.num_pos_feats)

            pos_x = x_embed.unsqueeze(-1) / dim_t
            pos_y = y_embed.unsqueeze(-1) / dim_t
            # Interleave sin on even channels and cos on odd channels.
            pos_x = paddle.stack(
                (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
                axis=4).flatten(3)
            pos_y = paddle.stack(
                (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
                axis=4).flatten(3)
            pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2])
            return pos
        elif self.embed_type == 'learned':
            h, w = mask.shape[-2:]
            i = paddle.arange(w)
            j = paddle.arange(h)
            x_emb = self.col_embed(i)
            y_emb = self.row_embed(j)
            # BUGFIX: paddle.Tensor has no torch-style `repeat`; use `tile`
            # with a repeat_times list. `tile` also takes a single list, not
            # positional ints.
            pos = paddle.concat(
                [
                    x_emb.unsqueeze(0).tile([h, 1, 1]),
                    y_emb.unsqueeze(1).tile([1, w, 1]),
                ],
                axis=-1).transpose([2, 0, 1]).unsqueeze(0).tile(
                    [mask.shape[0], 1, 1, 1])
            return pos
        else:
            raise ValueError(f"not supported {self.embed_type}")
ppdet/modeling/transformers/utils.py
0 → 100644
浏览文件 @
3c1c576d
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
copy
import
paddle
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
..bbox_utils
import
bbox_overlaps
__all__
=
[
'_get_clones'
,
'bbox_overlaps'
,
'bbox_cxcywh_to_xyxy'
,
'bbox_xyxy_to_cxcywh'
,
'sigmoid_focal_loss'
]
def _get_clones(module, N):
    """Return a LayerList containing N independent deep copies of `module`."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.LayerList(copies)
def bbox_cxcywh_to_xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x0, y0, x1, y1) on the last axis."""
    cx, cy, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    corners = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
    return paddle.stack(corners, axis=-1)
def bbox_xyxy_to_cxcywh(x):
    """Convert boxes from (x0, y0, x1, y1) to (cx, cy, w, h) on the last axis."""
    left, top, right, bottom = x.unbind(-1)
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top
    return paddle.stack([center_x, center_y, width, height], axis=-1)
def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
    """Focal loss on sigmoid outputs (Lin et al., "Focal Loss for Dense
    Object Detection").

    Args:
        logit (Tensor): raw (pre-sigmoid) predictions.
        label (Tensor): binary targets, same shape as `logit`.
        normalizer (float): divisor applied to the reduced loss when > 1.
        alpha (float): class-balance weight; disabled when negative.
        gamma (float): focusing exponent on the modulating factor.
    Returns:
        Tensor: scalar loss, mean over the last kept axis then summed,
        optionally divided by `normalizer`.
    """
    prob = F.sigmoid(logit)
    ce_loss = F.binary_cross_entropy_with_logits(
        logit, label, reduction="none")
    # p_t: model's probability for the true class of each element.
    p_t = label * prob + (1 - label) * (1 - prob)
    # Down-weight easy examples via the (1 - p_t)^gamma modulating factor.
    loss = ce_loss * ((1 - p_t)**gamma)

    if alpha >= 0:
        alpha_t = alpha * label + (1 - alpha) * (1 - label)
        loss = alpha_t * loss

    reduced = loss.mean(1).sum()
    return reduced / normalizer if normalizer > 1. else reduced
ppdet/optimizer.py
浏览文件 @
3c1c576d
...
...
@@ -244,10 +244,11 @@ class OptimizerBuilder():
optim_args
=
self
.
optimizer
.
copy
()
optim_type
=
optim_args
[
'type'
]
del
optim_args
[
'type'
]
if
optim_type
!=
'AdamW'
:
optim_args
[
'weight_decay'
]
=
regularization
op
=
getattr
(
optimizer
,
optim_type
)
return
op
(
learning_rate
=
learning_rate
,
parameters
=
params
,
weight_decay
=
regularization
,
grad_clip
=
grad_clip
,
**
optim_args
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录