From 0b4dd432ec4f635a691d057c4bde11d4c334f583 Mon Sep 17 00:00:00 2001 From: George Ni Date: Tue, 11 May 2021 21:37:52 +0800 Subject: [PATCH] [MOT] JDE and DeepSORT: model (#2782) * jde and deepsort: model --- ppdet/modeling/__init__.py | 20 +- ppdet/modeling/architectures/__init__.py | 18 +- ppdet/modeling/architectures/deepsort.py | 111 +++++ ppdet/modeling/architectures/jde.py | 120 ++++++ ppdet/modeling/architectures/yolo.py | 48 ++- ppdet/modeling/backbones/darknet.py | 19 + ppdet/modeling/layers.py | 105 +++++ ppdet/modeling/losses/__init__.py | 2 + ppdet/modeling/losses/jde_loss.py | 182 +++++++++ ppdet/modeling/mot/__init__.py | 25 ++ ppdet/modeling/mot/matching/__init__.py | 19 + .../mot/matching/deepsort_matching.py | 379 ++++++++++++++++++ ppdet/modeling/mot/matching/jde_matching.py | 136 +++++++ ppdet/modeling/mot/motion/__init__.py | 17 + ppdet/modeling/mot/motion/kalman_filter.py | 270 +++++++++++++ ppdet/modeling/mot/tracker/__init__.py | 23 ++ .../modeling/mot/tracker/base_jde_tracker.py | 267 ++++++++++++ .../modeling/mot/tracker/base_sde_tracker.py | 145 +++++++ .../modeling/mot/tracker/deepsort_tracker.py | 165 ++++++++ ppdet/modeling/mot/tracker/jde_tracker.py | 244 +++++++++++ ppdet/modeling/mot/utils.py | 181 +++++++++ ppdet/modeling/mot/visualization.py | 126 ++++++ ppdet/modeling/necks/yolo_fpn.py | 72 +++- ppdet/modeling/ops.py | 26 +- ppdet/modeling/post_process.py | 39 +- ppdet/modeling/reid/__init__.py | 21 + ppdet/modeling/reid/jde_embedding_head.py | 187 +++++++++ ppdet/modeling/reid/pyramidal_embedding.py | 138 +++++++ ppdet/modeling/reid/resnet.py | 320 +++++++++++++++ requirements.txt | 2 + 30 files changed, 3388 insertions(+), 39 deletions(-) create mode 100644 ppdet/modeling/architectures/deepsort.py create mode 100644 ppdet/modeling/architectures/jde.py create mode 100644 ppdet/modeling/losses/jde_loss.py create mode 100644 ppdet/modeling/mot/__init__.py create mode 100644 ppdet/modeling/mot/matching/__init__.py create mode 100644 ppdet/modeling/mot/matching/deepsort_matching.py create mode 100644 ppdet/modeling/mot/matching/jde_matching.py create mode 100644 ppdet/modeling/mot/motion/__init__.py create mode 100644 ppdet/modeling/mot/motion/kalman_filter.py create mode 100644 ppdet/modeling/mot/tracker/__init__.py create mode 100644 ppdet/modeling/mot/tracker/base_jde_tracker.py create mode 100644 ppdet/modeling/mot/tracker/base_sde_tracker.py create mode 100644 ppdet/modeling/mot/tracker/deepsort_tracker.py create mode 100644 ppdet/modeling/mot/tracker/jde_tracker.py create mode 100644 ppdet/modeling/mot/utils.py create mode 100644 ppdet/modeling/mot/visualization.py create mode 100644 ppdet/modeling/reid/__init__.py create mode 100644 ppdet/modeling/reid/jde_embedding_head.py create mode 100644 ppdet/modeling/reid/pyramidal_embedding.py create mode 100644 ppdet/modeling/reid/resnet.py diff --git a/ppdet/modeling/__init__.py b/ppdet/modeling/__init__.py index 01968ba3c..f8aed7d60 100644 --- a/ppdet/modeling/__init__.py +++ b/ppdet/modeling/__init__.py @@ -1,5 +1,17 @@ -# OP docs may contains math formula which may cause -# DeprecationWarning in string parsing +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import warnings warnings.filterwarnings( action='ignore', category=DeprecationWarning, module='ops') @@ -13,6 +25,8 @@ from . import losses from . import architectures from . import post_process from . import layers +from . import reid +from . import mot from .ops import * from .backbones import * @@ -23,3 +37,5 @@ from .losses import * from .architectures import * from .post_process import * from .layers import * +from .reid import * +from .mot import * diff --git a/ppdet/modeling/architectures/__init__.py b/ppdet/modeling/architectures/__init__.py index 66063fdee..50566f7c9 100644 --- a/ppdet/modeling/architectures/__init__.py +++ b/ppdet/modeling/architectures/__init__.py @@ -1,10 +1,10 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 from . import meta_arch from . import faster_rcnn from . import mask_rcnn @@ -17,6 +17,8 @@ from . import ttfnet from . import s2anet from . import keypoint_hrhrnet from . import keypoint_hrnet +from . import jde +from . import deepsort from .meta_arch import * from .faster_rcnn import * @@ -30,3 +32,5 @@ from .ttfnet import * from .s2anet import * from .keypoint_hrhrnet import * from .keypoint_hrnet import * +from .jde import * +from .deepsort import * diff --git a/ppdet/modeling/architectures/deepsort.py b/ppdet/modeling/architectures/deepsort.py new file mode 100644 index 000000000..d5dd13269 --- /dev/null +++ b/ppdet/modeling/architectures/deepsort.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
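+#
+# A minimal sketch of the per-frame flow implemented by `DeepSORT._forward`
+# below (illustrative only; `detector`, `reid` and `tracker` are the
+# config-created instances returned by `from_config`):
+#
+#   outs = detector(inputs)                       # 1. detect boxes
+#   crops, scores = get_crops(pred_bboxes, ori_image, pred_scores,
+#                             w=64, h=192)        # 2. crop pedestrians
+#   features = reid(paddle.to_tensor(crops))      # 3. appearance embeddings
+#   tracker.predict()                             # 4. Kalman prediction
+#   online_targets = tracker.update(detections)   # 5. association + update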
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch +from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box + +__all__ = ['DeepSORT'] + + +@register +class DeepSORT(BaseArch): + """ + DeepSORT network, see https://arxiv.org/abs/1703.07402 + + Args: + detector (object): detector model instance + reid (object): reid model instance + tracker (object): tracker instance + """ + __category__ = 'architecture' + + def __init__(self, + detector='YOLOv3', + reid='PCBPlusDropoutPyramid', + tracker='DeepSORTTracker'): + super(DeepSORT, self).__init__() + self.detector = detector + self.reid = reid + self.tracker = tracker + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + if cfg['detector'] != 'None': + detector = create(cfg['detector']) + else: + detector = None + reid = create(cfg['reid']) + tracker = create(cfg['tracker']) + + return { + "detector": detector, + "reid": reid, + "tracker": tracker, + } + + def _forward(self): + assert 'ori_image' in self.inputs + load_dets = 'pred_bboxes' in self.inputs and 'pred_scores' in self.inputs + + ori_image = self.inputs['ori_image'] + input_shape = self.inputs['image'].shape[2:] + im_shape = self.inputs['im_shape'] + scale_factor = self.inputs['scale_factor'] + + if self.detector and not load_dets: + outs = self.detector(self.inputs) + if outs['bbox_num'] > 0: + pred_bboxes = scale_coords(outs['bbox'][:, 2:], input_shape, + im_shape, scale_factor) + pred_scores = outs['bbox'][:, 1:2] + else: + pred_bboxes = [] + pred_scores = [] + else: + pred_bboxes = self.inputs['pred_bboxes'] + pred_scores = self.inputs['pred_scores'] + + if len(pred_bboxes) > 0: + pred_bboxes = clip_box(pred_bboxes, input_shape, im_shape, + scale_factor) + bbox_tlwh = paddle.concat( + (pred_bboxes[:, 0:2], + pred_bboxes[:, 2:4] - pred_bboxes[:, 0:2] + 1), + axis=1) + + crops, pred_scores = get_crops( + pred_bboxes, ori_image, pred_scores, w=64, h=192) + + if len(crops) > 0: + features = self.reid(paddle.to_tensor(crops)) + detections = [Detection(bbox_tlwh[i], conf, features[i])\ + for i, conf in enumerate(pred_scores)] + else: + detections = [] + else: + detections = [] + + self.tracker.predict() + online_targets = self.tracker.update(detections) + + return online_targets + + def get_pred(self): + return self._forward() diff --git a/ppdet/modeling/architectures/jde.py b/ppdet/modeling/architectures/jde.py new file mode 100644 index 000000000..f971c42b1 --- /dev/null +++ b/ppdet/modeling/architectures/jde.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
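+#
+# A minimal sketch of the tracking-mode flow implemented by `JDE._forward`
+# below (illustrative only; boxes and embeddings come out of one shared
+# forward pass):
+#
+#   det_outs = detector(inputs)                     # boxes + 'emb_feats'
+#   emb_outs = reid(det_outs['emb_feats'], inputs)  # embedding maps
+#   pred_dets = concat([bbox[:, 2:], bbox[:, 1:2]]) # [x1, y1, x2, y2, score]
+#   pred_embs = gather_nd(emb_valid, nms_keep_idx)  # embeddings of kept boxes
+#   online_targets = tracker.update(pred_dets, pred_embs)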
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from ppdet.modeling.mot.utils import scale_coords +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['JDE'] + + +@register +class JDE(BaseArch): + """ + JDE network, see https://arxiv.org/abs/1909.12605v1 + + Args: + detector (object): detector model instance + reid (object): reid model instance + tracker (object): tracker instance + test_mode (str): 'detection', 'embedding' or 'tracking' + """ + __category__ = 'architecture' + + def __init__(self, + detector='YOLOv3', + reid='JDEEmbeddingHead', + tracker='JDETracker', + test_mode='detection'): + super(JDE, self).__init__() + self.detector = detector + self.reid = reid + self.tracker = tracker + self.test_mode = test_mode + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + detector = create(cfg['detector']) + kwargs = {'input_shape': detector.neck.out_shape} + + reid = create(cfg['reid'], **kwargs) + + tracker = create(cfg['tracker']) + + return { + "detector": detector, + "reid": reid, + "tracker": tracker, + } + + def _forward(self): + det_outs = self.detector(self.inputs) + + if self.training: + emb_feats = det_outs['emb_feats'] + loss_confs = det_outs['det_losses']['loss_confs'] + loss_boxes = det_outs['det_losses']['loss_boxes'] + jde_losses = self.reid(emb_feats, self.inputs, loss_confs, + loss_boxes) + return jde_losses + else: + if self.test_mode == 'detection': + det_results = { + 'bbox': det_outs['bbox'], + 'bbox_num': det_outs['bbox_num'], + } + return det_results + + elif self.test_mode == 'embedding': + emb_feats = det_outs['emb_feats'] + embs_and_gts = self.reid(emb_feats, self.inputs, test_emb=True) + return embs_and_gts + + elif self.test_mode == 'tracking': + emb_feats = det_outs['emb_feats'] + emb_outs = self.reid(emb_feats, self.inputs) + + boxes_idx = det_outs['boxes_idx'] + bbox = det_outs['bbox'] + + input_shape = self.inputs['image'].shape[2:] + im_shape = self.inputs['im_shape'] + scale_factor = self.inputs['scale_factor'] + + bbox[:, 2:] = scale_coords(bbox[:, 2:], input_shape, im_shape, + scale_factor) + + nms_keep_idx = det_outs['nms_keep_idx'] + + pred_dets = paddle.concat((bbox[:, 2:], bbox[:, 1:2]), axis=1) + + emb_valid = paddle.gather_nd(emb_outs, boxes_idx) + pred_embs = paddle.gather_nd(emb_valid, nms_keep_idx) + + online_targets = self.tracker.update(pred_dets, pred_embs) + return online_targets + + else: + raise ValueError("Unknown test_mode {}.".format(self.test_mode)) + + def get_loss(self): + return self._forward() + + def get_pred(self): + return self._forward() diff --git a/ppdet/modeling/architectures/yolo.py b/ppdet/modeling/architectures/yolo.py index 6c0444480..276f86e9f 100644 --- a/ppdet/modeling/architectures/yolo.py +++ b/ppdet/modeling/architectures/yolo.py @@ -19,7 +19,8 @@ class YOLOv3(BaseArch): neck='YOLOv3FPN', yolo_head='YOLOv3Head', post_process='BBoxPostProcess', - data_format='NCHW'): + data_format='NCHW', + for_mot=False): """ YOLOv3 network, see https://arxiv.org/abs/1804.02767 @@ -29,12 +30,14 @@ class YOLOv3(BaseArch): yolo_head (nn.Layer): anchor_head instance bbox_post_process (object): `BBoxPostProcess` instance data_format (str): data format, NCHW or NHWC + for_mot (bool): whether return other features used in tracking model """ super(YOLOv3, self).__init__(data_format=data_format) self.backbone = backbone self.neck = neck self.yolo_head = yolo_head self.post_process = post_process + 
self.for_mot = for_mot @classmethod def from_config(cls, cfg, *args, **kwargs): @@ -57,21 +60,44 @@ class YOLOv3(BaseArch): def _forward(self): body_feats = self.backbone(self.inputs) - body_feats = self.neck(body_feats) + neck_feats = self.neck(body_feats, self.for_mot) + + if isinstance(neck_feats, dict): + assert self.for_mot == True + emb_feats = neck_feats['emb_feats'] + neck_feats = neck_feats['yolo_feats'] if self.training: - return self.yolo_head(body_feats, self.inputs) + yolo_losses = self.yolo_head(neck_feats, self.inputs) + + if self.for_mot: + return {'det_losses': yolo_losses, 'emb_feats': emb_feats} + else: + return yolo_losses + else: - yolo_head_outs = self.yolo_head(body_feats) - bbox, bbox_num = self.post_process( - yolo_head_outs, self.yolo_head.mask_anchors, - self.inputs['im_shape'], self.inputs['scale_factor']) - return bbox, bbox_num + yolo_head_outs = self.yolo_head(neck_feats) + + if self.for_mot: + boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process( + yolo_head_outs, self.yolo_head.mask_anchors) + output = { + 'bbox': bbox, + 'bbox_num': bbox_num, + 'boxes_idx': boxes_idx, + 'nms_keep_idx': nms_keep_idx, + 'emb_feats': emb_feats, + } + else: + bbox, bbox_num = self.post_process( + yolo_head_outs, self.yolo_head.mask_anchors, + self.inputs['im_shape'], self.inputs['scale_factor']) + output = {'bbox': bbox, 'bbox_num': bbox_num} + + return output def get_loss(self): return self._forward() def get_pred(self): - bbox_pred, bbox_num = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output + return self._forward() diff --git a/ppdet/modeling/backbones/darknet.py b/ppdet/modeling/backbones/darknet.py index 8d3d07a25..9c78dbbb1 100755 --- a/ppdet/modeling/backbones/darknet.py +++ b/ppdet/modeling/backbones/darknet.py @@ -35,6 +35,7 @@ class ConvBNLayer(nn.Layer): norm_type='bn', norm_decay=0., act="leaky", + freeze_norm=False, data_format='NCHW', name=''): """ @@ -50,6 +51,7 @@ class ConvBNLayer(nn.Layer): norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. act (str): activation function type, default 'leaky', which means leaky_relu + freeze_norm (bool): whether to freeze norm, default False data_format (str): data format, NCHW or NHWC """ super(ConvBNLayer, self).__init__() @@ -67,6 +69,7 @@ class ConvBNLayer(nn.Layer): ch_out, norm_type=norm_type, norm_decay=norm_decay, + freeze_norm=freeze_norm, data_format=data_format) self.act = act @@ -89,6 +92,7 @@ class DownSample(nn.Layer): padding=1, norm_type='bn', norm_decay=0., + freeze_norm=False, data_format='NCHW'): """ downsample layer @@ -101,6 +105,7 @@ class DownSample(nn.Layer): padding (int): padding size, default 1 norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. + freeze_norm (bool): whether to freeze norm, default False data_format (str): data format, NCHW or NHWC """ @@ -114,6 +119,7 @@ class DownSample(nn.Layer): padding=padding, norm_type=norm_type, norm_decay=norm_decay, + freeze_norm=freeze_norm, data_format=data_format) self.ch_out = ch_out @@ -128,6 +134,7 @@ class BasicBlock(nn.Layer): ch_out, norm_type='bn', norm_decay=0., + freeze_norm=False, data_format='NCHW'): """ BasicBlock layer of DarkNet @@ -137,6 +144,7 @@ class BasicBlock(nn.Layer): ch_out (int): output channel norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
+ freeze_norm (bool): whether to freeze norm, default False data_format (str): data format, NCHW or NHWC """ @@ -150,6 +158,7 @@ class BasicBlock(nn.Layer): padding=0, norm_type=norm_type, norm_decay=norm_decay, + freeze_norm=freeze_norm, data_format=data_format) self.conv2 = ConvBNLayer( ch_in=ch_out, @@ -159,6 +168,7 @@ class BasicBlock(nn.Layer): padding=1, norm_type=norm_type, norm_decay=norm_decay, + freeze_norm=freeze_norm, data_format=data_format) def forward(self, inputs): @@ -175,6 +185,7 @@ class Blocks(nn.Layer): count, norm_type='bn', norm_decay=0., + freeze_norm=False, name=None, data_format='NCHW'): """ @@ -186,6 +197,7 @@ class Blocks(nn.Layer): count (int): number of BasicBlock layer norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. + freeze_norm (bool): whether to freeze norm, default False name (str): layer name data_format (str): data format, NCHW or NHWC """ @@ -196,6 +208,7 @@ class Blocks(nn.Layer): ch_out, norm_type=norm_type, norm_decay=norm_decay, + freeze_norm=freeze_norm, data_format=data_format) self.res_out_list = [] for i in range(1, count): @@ -207,6 +220,7 @@ class Blocks(nn.Layer): ch_out, norm_type=norm_type, norm_decay=norm_decay, + freeze_norm=freeze_norm, data_format=data_format)) self.res_out_list.append(res_out) self.ch_out = ch_out @@ -233,6 +247,7 @@ class DarkNet(nn.Layer): num_stages=5, norm_type='bn', norm_decay=0., + freeze_norm=False, data_format='NCHW'): """ Darknet, see https://pjreddie.com/darknet/yolo/ @@ -261,6 +276,7 @@ class DarkNet(nn.Layer): padding=1, norm_type=norm_type, norm_decay=norm_decay, + freeze_norm=freeze_norm, data_format=data_format) self.downsample0 = DownSample( @@ -268,6 +284,7 @@ class DarkNet(nn.Layer): ch_out=32 * 2, norm_type=norm_type, norm_decay=norm_decay, + freeze_norm=freeze_norm, data_format=data_format) self._out_channels = [] @@ -284,6 +301,7 @@ class DarkNet(nn.Layer): stage, norm_type=norm_type, norm_decay=norm_decay, + freeze_norm=freeze_norm, data_format=data_format, name=name)) self.darknet_conv_block_list.append(conv_block) @@ -298,6 +316,7 @@ class DarkNet(nn.Layer): ch_out=32 * (2**(i + 2)), norm_type=norm_type, norm_decay=norm_decay, + freeze_norm=freeze_norm, data_format=data_format)) self.downsample_list.append(downsample) diff --git a/ppdet/modeling/layers.py b/ppdet/modeling/layers.py index 57fb0a99f..b6c13f38a 100644 --- a/ppdet/modeling/layers.py +++ b/ppdet/modeling/layers.py @@ -836,6 +836,111 @@ class TTFBox(object): return results, paddle.shape(results)[0:1] +@register +@serializable +class JDEBox(object): + __shared__ = ['num_classes'] + + def __init__(self, num_classes=1, conf_thresh=0.3, downsample_ratio=32): + self.num_classes = num_classes + self.conf_thresh = conf_thresh + self.downsample_ratio = downsample_ratio + + def generate_anchor(self, nGh, nGw, anchor_wh): + nA = len(anchor_wh) + yv, xv = paddle.meshgrid([paddle.arange(nGh), paddle.arange(nGw)]) + mesh = paddle.stack( + (xv, yv), axis=0).cast(dtype='float32') # 2 x nGh x nGw + meshs = paddle.tile(mesh, [nA, 1, 1, 1]) + + anchor_offset_mesh = anchor_wh[:, :, None][:, :, :, None].repeat( + int(nGh), axis=-2).repeat( + int(nGw), axis=-1) + anchor_offset_mesh = paddle.to_tensor( + anchor_offset_mesh.astype(np.float32)) + # nA x 2 x nGh x nGw + + anchor_mesh = paddle.concat([meshs, anchor_offset_mesh], axis=1) + anchor_mesh = paddle.transpose(anchor_mesh, + [0, 2, 3, 1]) # (nA x nGh x nGw) x 4 + return anchor_mesh + + def decode_delta(self, delta, 
fg_anchor_list): + px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ + fg_anchor_list[:, 2], fg_anchor_list[:,3] + dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3] + gx = pw * dx + px + gy = ph * dy + py + gw = pw * paddle.exp(dw) + gh = ph * paddle.exp(dh) + gx1 = gx - gw * 0.5 + gy1 = gy - gh * 0.5 + gx2 = gx + gw * 0.5 + gy2 = gy + gh * 0.5 + return paddle.stack([gx1, gy1, gx2, gy2], axis=1) + + def decode_delta_map(self, delta_map, anchors): + delta_map_shape = paddle.shape(delta_map) + delta_map_shape.stop_gradient = True + nB, nA, nGh, nGw, _ = delta_map_shape[:] + anchor_mesh = self.generate_anchor(nGh, nGw, anchors) + # only support bs=1 + anchor_mesh = paddle.unsqueeze(anchor_mesh, 0) + + pred_list = self.decode_delta( + paddle.reshape( + delta_map, shape=[-1, 4]), + paddle.reshape( + anchor_mesh, shape=[-1, 4])) + pred_map = paddle.reshape(pred_list, shape=[nB, -1, 4]) + return pred_map + + def __call__(self, yolo_head_out, anchors): + bbox_pred_list = [] + for i, head_out in enumerate(yolo_head_out): + stride = self.downsample_ratio // 2**i + anc_w, anc_h = anchors[i][0::2], anchors[i][1::2] + anchor_vec = np.stack((anc_w, anc_h), axis=1) / stride + nA = len(anc_w) + boxes_shape = paddle.shape(head_out) + boxes_shape.stop_gradient = True + nB, nGh, nGw = boxes_shape[0], boxes_shape[-2], boxes_shape[-1] + + p = head_out.reshape((nB, nA, self.num_classes + 5, nGh, nGw)) + p = paddle.transpose(p, perm=[0, 1, 3, 4, 2]) # [nB, 4, 19, 34, 6] + p_box = p[:, :, :, :, :4] # [nB, 4, 19, 34, 4] + boxes = self.decode_delta_map(p_box, anchor_vec) # [nB, 4*19*34, 4] + boxes = boxes * stride + + p_conf = paddle.transpose( + p[:, :, :, :, 4:6], perm=[0, 4, 1, 2, 3]) # [nB, 2, 4, 19, 34] + p_conf = F.softmax( + p_conf, + axis=1)[:, 1, :, :, :].unsqueeze(-1) # [nB, 4, 19, 34, 1] + scores = paddle.reshape(p_conf, shape=[nB, -1, 1]) + + bbox_pred_list.append(paddle.concat([boxes, scores], axis=-1)) + + yolo_boxes_pred = paddle.concat(bbox_pred_list, axis=1) + boxes_idx = paddle.nonzero(yolo_boxes_pred[:, :, -1] > self.conf_thresh) + boxes_idx.stop_gradient = True + if boxes_idx.shape[0] == 0: # TODO: deploy + boxes_idx = paddle.to_tensor(np.array([[0]], dtype='int64')) + yolo_boxes_out = paddle.to_tensor( + np.array( + [[[0.0, 0.0, 0.0, 0.0]]], dtype='float32')) + yolo_scores_out = paddle.to_tensor( + np.array( + [[[0.0]]], dtype='float32')) + return boxes_idx, yolo_boxes_out, yolo_scores_out + + yolo_boxes = paddle.gather_nd(yolo_boxes_pred, boxes_idx) + yolo_boxes_out = paddle.reshape(yolo_boxes[:, :4], shape=[nB, -1, 4]) + yolo_scores_out = paddle.reshape(yolo_boxes[:, 4:5], shape=[nB, 1, -1]) + boxes_idx = boxes_idx[:, 1:] + return boxes_idx, yolo_boxes_out, yolo_scores_out # [163], [1, 163, 4], [1, 1, 163] + + @register @serializable class MaskMatrixNMS(object): diff --git a/ppdet/modeling/losses/__init__.py b/ppdet/modeling/losses/__init__.py index f4c914516..4cbef1cde 100644 --- a/ppdet/modeling/losses/__init__.py +++ b/ppdet/modeling/losses/__init__.py @@ -20,6 +20,7 @@ from . import fcos_loss from . import solov2_loss from . import ctfocal_loss from . import keypoint_loss +from . 
import jde_loss from .yolo_loss import * from .iou_aware_loss import * @@ -29,3 +30,4 @@ from .fcos_loss import * from .solov2_loss import * from .ctfocal_loss import * from .keypoint_loss import * +from .jde_loss import * diff --git a/ppdet/modeling/losses/jde_loss.py b/ppdet/modeling/losses/jde_loss.py new file mode 100644 index 000000000..59ace08f2 --- /dev/null +++ b/ppdet/modeling/losses/jde_loss.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register + +__all__ = ['JDEDetectionLoss', 'JDEEmbeddingLoss', 'JDELoss'] + + +@register +class JDEDetectionLoss(nn.Layer): + __shared__ = ['num_classes'] + + def __init__(self, num_classes=1): + super(JDEDetectionLoss, self).__init__() + self.num_classes = num_classes + + def det_loss(self, p_det, anchor, t_conf, t_box): + pshape = paddle.shape(p_det) + pshape.stop_gradient = True + nB, nGh, nGw = pshape[0], pshape[-2], pshape[-1] + nA = len(anchor) + p_det = paddle.reshape( + p_det, [nB, nA, self.num_classes + 5, nGh, nGw]).transpose( + (0, 1, 3, 4, 2)) + + # 1. loss_conf: cross_entropy + p_conf = p_det[:, :, :, :, 4:6] + p_conf_flatten = paddle.reshape(p_conf, [-1, 2]) + t_conf_flatten = t_conf.flatten() + t_conf_flatten = paddle.cast(t_conf_flatten, dtype="int64") + t_conf_flatten.stop_gradient = True + loss_conf = F.cross_entropy( + p_conf_flatten, t_conf_flatten, ignore_index=-1, reduction='mean') + loss_conf.stop_gradient = False + + # 2. loss_box: smooth_l1_loss + p_box = p_det[:, :, :, :, :4] + p_box_flatten = paddle.reshape(p_box, [-1, 4]) + t_box_flatten = paddle.reshape(t_box, [-1, 4]) + fg_inds = paddle.nonzero(t_conf_flatten > 0).flatten() + if fg_inds.numel() > 0: + reg_delta = paddle.gather(p_box_flatten, fg_inds) + reg_target = paddle.gather(t_box_flatten, fg_inds) + else: + reg_delta = paddle.to_tensor([0, 0, 0, 0], dtype='float32') + reg_delta.stop_gradient = False + reg_target = paddle.to_tensor([0, 0, 0, 0], dtype='float32') + reg_target.stop_gradient = True + loss_box = F.smooth_l1_loss( + reg_delta, reg_target, reduction='mean', delta=1.0) + loss_box.stop_gradient = False + + return loss_conf, loss_box + + def forward(self, det_outs, targets, anchors): + """ + Args: + det_outs (list[Tensor]): output from detection head, each one + is a 4-D Tensor with shape [N, C, H, W]. + targets (dict): contains 'im_id', 'gt_bbox', 'gt_ide', 'image', + 'im_shape', 'scale_factor' and 'tbox', 'tconf', 'tide' of + each FPN level. + anchors (list[list]): anchor setting of JDE model, N row M col, N is + the anchor levels(FPN levels), M is the anchor scales each + level. 
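+                For example, with 3 FPN levels and 4 anchor scales per
+                level, `anchors` holds 3 rows of 8 values each:
+                [w0,h0, w1,h1, w2,h2, w3,h3].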
+ """ + assert len(det_outs) == len(anchors) + loss_confs = [] + loss_boxes = [] + for i, (p_det, anchor) in enumerate(zip(det_outs, anchors)): + t_conf = targets['tconf{}'.format(i)] + t_box = targets['tbox{}'.format(i)] + + loss_conf, loss_box = self.det_loss(p_det, anchor, t_conf, t_box) + loss_confs.append(loss_conf) + loss_boxes.append(loss_box) + return {'loss_confs': loss_confs, 'loss_boxes': loss_boxes} + + +@register +class JDEEmbeddingLoss(nn.Layer): + def __init__(self, ): + super(JDEEmbeddingLoss, self).__init__() + self.phony = self.create_parameter(shape=[1], dtype="float32") + + def emb_loss(self, p_ide, t_conf, t_ide, emb_scale, classifier): + emb_dim = p_ide.shape[1] + p_ide = p_ide.transpose((0, 2, 3, 1)) + p_ide_flatten = paddle.reshape(p_ide, [-1, emb_dim]) + mask = t_conf > 0 + mask = paddle.cast(mask, dtype="int64") + mask.stop_gradient = True + emb_mask = mask.max(1).flatten() + emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten() + emb_mask_inds.stop_gradient = True + # use max(1) to decide the id, TODO: more reseanable strategy + t_ide_flatten = t_ide.max(1).flatten() + t_ide_flatten = paddle.cast(t_ide_flatten, dtype="int64") + valid_inds = paddle.nonzero(t_ide_flatten != -1).flatten() + + if emb_mask_inds.numel() == 0 or valid_inds.numel() == 0: + # loss_ide = paddle.to_tensor([0]) # will be error in gradient backward + loss_ide = self.phony * 0 # todo + else: + embedding = paddle.gather(p_ide_flatten, emb_mask_inds) + embedding = emb_scale * F.normalize(embedding) + logits = classifier(embedding) + + ide_target = paddle.gather(t_ide_flatten, emb_mask_inds) + + loss_ide = F.cross_entropy( + logits, ide_target, ignore_index=-1, reduction='mean') + loss_ide.stop_gradient = False + + return loss_ide + + def forward(self, ide_outs, targets, emb_scale, classifier): + loss_ides = [] + for i, p_ide in enumerate(ide_outs): + t_conf = targets['tconf{}'.format(i)] + t_ide = targets['tide{}'.format(i)] + + loss_ide = self.emb_loss(p_ide, t_conf, t_ide, emb_scale, + classifier) + loss_ides.append(loss_ide) + return loss_ides + + +@register +class JDELoss(nn.Layer): + def __init__(self): + super(JDELoss, self).__init__() + + def forward(self, loss_confs, loss_boxes, loss_ides, loss_params_cls, + loss_params_reg, loss_params_ide, targets): + assert len(loss_confs) == len(loss_boxes) == len(loss_ides) + assert len(loss_params_cls) == len(loss_params_reg) == len( + loss_params_ide) + assert len(loss_confs) == len(loss_params_cls) + + batchsize = targets['gt_bbox'].shape[0] + nTargets = paddle.nonzero(paddle.sum(targets['gt_bbox'], axis=2)).shape[ + 0] / batchsize + nTargets = paddle.to_tensor(nTargets, dtype='float32') + nTargets.stop_gradient = True + + jde_losses = [] + for i, (loss_conf, loss_box, loss_ide, l_conf_p, l_box_p, + l_ide_p) in enumerate( + zip(loss_confs, loss_boxes, loss_ides, loss_params_cls, + loss_params_reg, loss_params_ide)): + + jde_loss = l_conf_p(loss_conf) + l_box_p(loss_box) + l_ide_p( + loss_ide) + jde_losses.append(jde_loss) + + loss_all = { + "loss_conf": sum(loss_confs), + "loss_box": sum(loss_boxes), + "loss_ide": sum(loss_ides), + "loss": sum(jde_losses), + "nTargets": nTargets, + } + return loss_all diff --git a/ppdet/modeling/mot/__init__.py b/ppdet/modeling/mot/__init__.py new file mode 100644 index 000000000..258e4c901 --- /dev/null +++ b/ppdet/modeling/mot/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import matching +from . import tracker +from . import motion +from . import visualization +from . import utils + +from .matching import * +from .tracker import * +from .motion import * +from .visualization import * +from .utils import * diff --git a/ppdet/modeling/mot/matching/__init__.py b/ppdet/modeling/mot/matching/__init__.py new file mode 100644 index 000000000..54c6680f7 --- /dev/null +++ b/ppdet/modeling/mot/matching/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import jde_matching +from . import deepsort_matching + +from .jde_matching import * +from .deepsort_matching import * diff --git a/ppdet/modeling/mot/matching/deepsort_matching.py b/ppdet/modeling/mot/matching/deepsort_matching.py new file mode 100644 index 000000000..18dc911ce --- /dev/null +++ b/ppdet/modeling/mot/matching/deepsort_matching.py @@ -0,0 +1,379 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is borrow from https://github.com/nwojke/deep_sort/tree/master/deep_sort +""" + +import numpy as np +from scipy.optimize import linear_sum_assignment +from ..motion import kalman_filter + +INFTY_COST = 1e+5 + +__all__ = [ + 'iou_1toN', + 'iou_cost', + '_nn_euclidean_distance', + '_nn_cosine_distance', + 'NearestNeighborDistanceMetric', + 'min_cost_matching', + 'matching_cascade', + 'gate_cost_matrix', +] + + +def iou_1toN(bbox, candidates): + """ + Computer intersection over union (IoU) by one box to N candidates. + + Args: + bbox (ndarray): A bounding box in format `(top left x, top left y, width, height)`. + candidates (ndarray): A matrix of candidate bounding boxes (one per row) in the + same format as `bbox`. + + Returns: + ious (ndarray): The intersection over union in [0, 1] between the `bbox` + and each candidate. 
A higher score means a larger fraction of the + `bbox` is occluded by the candidate. + """ + bbox_tl = bbox[:2] + bbox_br = bbox[:2] + bbox[2:] + candidates_tl = candidates[:, :2] + candidates_br = candidates[:, :2] + candidates[:, 2:] + + tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis], + np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]] + br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis], + np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]] + wh = np.maximum(0., br - tl) + + area_intersection = wh.prod(axis=1) + area_bbox = bbox[2:].prod() + area_candidates = candidates[:, 2:].prod(axis=1) + ious = area_intersection / (area_bbox + area_candidates - area_intersection) + return ious + + +def iou_cost(tracks, detections, track_indices=None, detection_indices=None): + """ + IoU distance metric. + + Args: + tracks (list[Track]): A list of tracks. + detections (list[Detection]): A list of detections. + track_indices (Optional[list[int]]): A list of indices to tracks that + should be matched. Defaults to all `tracks`. + detection_indices (Optional[list[int]]): A list of indices to detections + that should be matched. Defaults to all `detections`. + + Returns: + cost_matrix (ndarray): A cost matrix of shape len(track_indices), + len(detection_indices) where entry (i, j) is + `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`. + """ + if track_indices is None: + track_indices = np.arange(len(tracks)) + if detection_indices is None: + detection_indices = np.arange(len(detections)) + + cost_matrix = np.zeros((len(track_indices), len(detection_indices))) + for row, track_idx in enumerate(track_indices): + if tracks[track_idx].time_since_update > 1: + cost_matrix[row, :] = 1e+5 + continue + + bbox = tracks[track_idx].to_tlwh() + candidates = np.asarray([detections[i].tlwh for i in detection_indices]) + cost_matrix[row, :] = 1. - iou_1toN(bbox, candidates) + return cost_matrix + + +def _nn_euclidean_distance(s, q): + """ + Compute pair-wise squared (Euclidean) distance between points in `s` and `q`. + + Args: + s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M. + q (ndarray): Query points: an LxM matrix of L samples of dimensionality M. + + Returns: + distances (ndarray): A vector of length M that contains for each entry in `q` the + smallest Euclidean distance to a sample in `s`. + """ + s, q = np.asarray(s), np.asarray(q) + if len(s) == 0 or len(q) == 0: + return np.zeros((len(s), len(q))) + s2, q2 = np.square(s).sum(axis=1), np.square(q).sum(axis=1) + distances = -2. * np.dot(s, q.T) + s2[:, None] + q2[None, :] + distances = np.clip(distances, 0., float(np.inf)) + + return np.maximum(0.0, distances.min(axis=0)) + + +def _nn_cosine_distance(s, q): + """ + Compute pair-wise cosine distance between points in `s` and `q`. + + Args: + s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M. + q (ndarray): Query points: an LxM matrix of L samples of dimensionality M. + + Returns: + distances (ndarray): A vector of length M that contains for each entry in `q` the + smallest Euclidean distance to a sample in `s`. + """ + s = np.asarray(s) / np.linalg.norm(s, axis=1, keepdims=True) + q = np.asarray(q) / np.linalg.norm(q, axis=1, keepdims=True) + distances = 1. 
- np.dot(s, q.T) + + return distances.min(axis=0) + + +class NearestNeighborDistanceMetric(object): + """ + A nearest neighbor distance metric that, for each target, returns + the closest distance to any sample that has been observed so far. + + Args: + metric (str): Either "euclidean" or "cosine". + matching_threshold (float): The matching threshold. Samples with larger + distance are considered an invalid match. + budget (Optional[int]): If not None, fix samples per class to at most + this number. Removes the oldest samples when the budget is reached. + + Attributes: + samples (Dict[int -> List[ndarray]]): A dictionary that maps from target + identities to the list of samples that have been observed so far. + """ + + def __init__(self, metric, matching_threshold, budget=None): + if metric == "euclidean": + self._metric = _nn_euclidean_distance + elif metric == "cosine": + self._metric = _nn_cosine_distance + else: + raise ValueError( + "Invalid metric; must be either 'euclidean' or 'cosine'") + self.matching_threshold = matching_threshold + self.budget = budget + self.samples = {} + + def partial_fit(self, features, targets, active_targets): + """ + Update the distance metric with new data. + + Args: + features (ndarray): An NxM matrix of N features of dimensionality M. + targets (ndarray): An integer array of associated target identities. + active_targets (List[int]): A list of targets that are currently + present in the scene. + """ + for feature, target in zip(features, targets): + self.samples.setdefault(target, []).append(feature) + if self.budget is not None: + self.samples[target] = self.samples[target][-self.budget:] + self.samples = {k: self.samples[k] for k in active_targets} + + def distance(self, features, targets): + """ + Compute distance between features and targets. + + Args: + features (ndarray): An NxM matrix of N features of dimensionality M. + targets (list[int]): A list of targets to match the given `features` against. + + Returns: + cost_matrix (ndarray): a cost matrix of shape len(targets), len(features), + where element (i, j) contains the closest squared distance between + `targets[i]` and `features[j]`. + """ + cost_matrix = np.zeros((len(targets), len(features))) + for i, target in enumerate(targets): + cost_matrix[i, :] = self._metric(self.samples[target], features) + return cost_matrix + + +def min_cost_matching(distance_metric, + max_distance, + tracks, + detections, + track_indices=None, + detection_indices=None): + """ + Solve linear assignment problem. + + Args: + distance_metric : + Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray + The distance metric is given a list of tracks and detections as + well as a list of N track indices and M detection indices. The + metric should return the NxM dimensional cost matrix, where element + (i, j) is the association cost between the i-th track in the given + track indices and the j-th detection in the given detection_indices. + max_distance (float): Gating threshold. Associations with cost larger + than this value are disregarded. + tracks (list[Track]): A list of predicted tracks at the current time + step. + detections (list[Detection]): A list of detections at the current time + step. + track_indices (list[int]): List of track indices that maps rows in + `cost_matrix` to tracks in `tracks`. + detection_indices (List[int]): List of detection indices that maps + columns in `cost_matrix` to detections in `detections`. 
+ + Returns: + A tuple (List[(int, int)], List[int], List[int]) with the following + three entries: + * A list of matched track and detection indices. + * A list of unmatched track indices. + * A list of unmatched detection indices. + """ + if track_indices is None: + track_indices = np.arange(len(tracks)) + if detection_indices is None: + detection_indices = np.arange(len(detections)) + + if len(detection_indices) == 0 or len(track_indices) == 0: + return [], track_indices, detection_indices # Nothing to match. + + cost_matrix = distance_metric(tracks, detections, track_indices, + detection_indices) + + cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5 + indices = linear_sum_assignment(cost_matrix) + + matches, unmatched_tracks, unmatched_detections = [], [], [] + for col, detection_idx in enumerate(detection_indices): + if col not in indices[1]: + unmatched_detections.append(detection_idx) + for row, track_idx in enumerate(track_indices): + if row not in indices[0]: + unmatched_tracks.append(track_idx) + for row, col in zip(indices[0], indices[1]): + track_idx = track_indices[row] + detection_idx = detection_indices[col] + if cost_matrix[row, col] > max_distance: + unmatched_tracks.append(track_idx) + unmatched_detections.append(detection_idx) + else: + matches.append((track_idx, detection_idx)) + return matches, unmatched_tracks, unmatched_detections + + +def matching_cascade(distance_metric, + max_distance, + cascade_depth, + tracks, + detections, + track_indices=None, + detection_indices=None): + """ + Run matching cascade. + + Args: + distance_metric : + Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray + The distance metric is given a list of tracks and detections as + well as a list of N track indices and M detection indices. The + metric should return the NxM dimensional cost matrix, where element + (i, j) is the association cost between the i-th track in the given + track indices and the j-th detection in the given detection_indices. + max_distance (float): Gating threshold. Associations with cost larger + than this value are disregarded. + cascade_depth (int): The cascade depth, should be se to the maximum + track age. + tracks (list[Track]): A list of predicted tracks at the current time + step. + detections (list[Detection]): A list of detections at the current time + step. + track_indices (list[int]): List of track indices that maps rows in + `cost_matrix` to tracks in `tracks`. + detection_indices (List[int]): List of detection indices that maps + columns in `cost_matrix` to detections in `detections`. + + Returns: + A tuple (List[(int, int)], List[int], List[int]) with the following + three entries: + * A list of matched track and detection indices. + * A list of unmatched track indices. + * A list of unmatched detection indices. 
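+
+        The cascade matches recently seen tracks first: tracks with
+        `time_since_update == 1` are tried at level 0, age-2 tracks at
+        level 1, and so on up to `cascade_depth`.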
+ """ + if track_indices is None: + track_indices = list(range(len(tracks))) + if detection_indices is None: + detection_indices = list(range(len(detections))) + + unmatched_detections = detection_indices + matches = [] + for level in range(cascade_depth): + if len(unmatched_detections) == 0: # No detections left + break + + track_indices_l = [ + k for k in track_indices if tracks[k].time_since_update == 1 + level + ] + if len(track_indices_l) == 0: # Nothing to match at this level + continue + + matches_l, _, unmatched_detections = \ + min_cost_matching( + distance_metric, max_distance, tracks, detections, + track_indices_l, unmatched_detections) + matches += matches_l + unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches)) + return matches, unmatched_tracks, unmatched_detections + + +def gate_cost_matrix(kf, + cost_matrix, + tracks, + detections, + track_indices, + detection_indices, + gated_cost=INFTY_COST, + only_position=False): + """ + Invalidate infeasible entries in cost matrix based on the state + distributions obtained by Kalman filtering. + + Args: + kf (object): The Kalman filter. + cost_matrix (ndarray): The NxM dimensional cost matrix, where N is the + number of track indices and M is the number of detection indices, + such that entry (i, j) is the association cost between + `tracks[track_indices[i]]` and `detections[detection_indices[j]]`. + tracks (list[Track]): A list of predicted tracks at the current time + step. + detections (list[Detection]): A list of detections at the current time + step. + track_indices (List[int]): List of track indices that maps rows in + `cost_matrix` to tracks in `tracks`. + detection_indices (List[int]): List of detection indices that maps + columns in `cost_matrix` to detections in `detections`. + gated_cost (Optional[float]): Entries in the cost matrix corresponding + to infeasible associations are set this value. Defaults to a very + large value. + only_position (Optional[bool]): If True, only the x, y position of the + state distribution is considered during gating. Default False. + """ + gating_dim = 2 if only_position else 4 + gating_threshold = kalman_filter.chi2inv95[gating_dim] + measurements = np.asarray( + [detections[i].to_xyah() for i in detection_indices]) + for row, track_idx in enumerate(track_indices): + track = tracks[track_idx] + gating_distance = kf.gating_distance(track.mean, track.covariance, + measurements, only_position) + cost_matrix[row, gating_distance > gating_threshold] = gated_cost + return cost_matrix diff --git a/ppdet/modeling/mot/matching/jde_matching.py b/ppdet/modeling/mot/matching/jde_matching.py new file mode 100644 index 000000000..7cf0e3850 --- /dev/null +++ b/ppdet/modeling/mot/matching/jde_matching.py @@ -0,0 +1,136 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is borrow from https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/matching.py +""" + +import scipy +import numpy as np +from scipy.spatial.distance import cdist +from ..motion import kalman_filter + +__all__ = [ + 'merge_matches', + 'linear_assignment', + 'cython_bbox_ious', + 'iou_distance', + 'embedding_distance', + 'fuse_motion', +] + + +def merge_matches(m1, m2, shape): + O, P, Q = shape + m1 = np.asarray(m1) + m2 = np.asarray(m2) + + M1 = scipy.sparse.coo_matrix( + (np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) + M2 = scipy.sparse.coo_matrix( + (np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) + + mask = M1 * M2 + match = mask.nonzero() + match = list(zip(match[0], match[1])) + unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) + unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) + + return match, unmatched_O, unmatched_Q + + +def linear_assignment(cost_matrix, thresh): + if cost_matrix.size == 0: + return np.empty( + (0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple( + range(cost_matrix.shape[1])) + matches, unmatched_a, unmatched_b = [], [], [] + import lap + cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh) + for ix, mx in enumerate(x): + if mx >= 0: + matches.append([ix, mx]) + unmatched_a = np.where(x < 0)[0] + unmatched_b = np.where(y < 0)[0] + matches = np.asarray(matches) + return matches, unmatched_a, unmatched_b + + +def cython_bbox_ious(atlbrs, btlbrs): + ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float) + if ious.size == 0: + return ious + import cython_bbox + ious = cython_bbox.bbox_overlaps( + np.ascontiguousarray( + atlbrs, dtype=np.float), + np.ascontiguousarray( + btlbrs, dtype=np.float)) + return ious + + +def iou_distance(atracks, btracks): + """ + Compute cost based on IoU between two list[STrack]. + """ + if (len(atracks) > 0 and isinstance(atracks[0], np.ndarray)) or ( + len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): + atlbrs = atracks + btlbrs = btracks + else: + atlbrs = [track.tlbr for track in atracks] + btlbrs = [track.tlbr for track in btracks] + _ious = cython_bbox_ious(atlbrs, btlbrs) + cost_matrix = 1 - _ious + + return cost_matrix + + +def embedding_distance(tracks, detections, metric='euclidean'): + """ + Compute cost based on features between two list[STrack]. 
+ """ + cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) + if cost_matrix.size == 0: + return cost_matrix + det_features = np.asarray( + [track.curr_feat for track in detections], dtype=np.float) + track_features = np.asarray( + [track.smooth_feat for track in tracks], dtype=np.float) + cost_matrix = np.maximum(0.0, cdist(track_features, det_features, + metric)) # Nomalized features + return cost_matrix + + +def fuse_motion(kf, + cost_matrix, + tracks, + detections, + only_position=False, + lambda_=0.98): + if cost_matrix.size == 0: + return cost_matrix + gating_dim = 2 if only_position else 4 + gating_threshold = kalman_filter.chi2inv95[gating_dim] + measurements = np.asarray([det.to_xyah() for det in detections]) + for row, track in enumerate(tracks): + gating_distance = kf.gating_distance( + track.mean, + track.covariance, + measurements, + only_position, + metric='maha') + cost_matrix[row, gating_distance > gating_threshold] = np.inf + cost_matrix[row] = lambda_ * cost_matrix[row] + (1 - lambda_ + ) * gating_distance + return cost_matrix diff --git a/ppdet/modeling/mot/motion/__init__.py b/ppdet/modeling/mot/motion/__init__.py new file mode 100644 index 000000000..e42dd0b01 --- /dev/null +++ b/ppdet/modeling/mot/motion/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import kalman_filter + +from .kalman_filter import * diff --git a/ppdet/modeling/mot/motion/kalman_filter.py b/ppdet/modeling/mot/motion/kalman_filter.py new file mode 100644 index 000000000..7ac960168 --- /dev/null +++ b/ppdet/modeling/mot/motion/kalman_filter.py @@ -0,0 +1,270 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is borrow from https://github.com/nwojke/deep_sort/blob/master/deep_sort/kalman_filter.py +""" + +import numpy as np +import scipy.linalg +from ppdet.core.workspace import register, serializable + +__all__ = ['KalmanFilter'] +""" +Table for the 0.95 quantile of the chi-square distribution with N degrees of +freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv +function and used as Mahalanobis gating threshold. +""" + +chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919 +} + + +@register +@serializable +class KalmanFilter(object): + """ + A simple Kalman filter for tracking bounding boxes in image space. 
+ + The 8-dimensional state space + + x, y, a, h, vx, vy, va, vh + + contains the bounding box center position (x, y), aspect ratio a, height h, + and their respective velocities. + + Object motion follows a constant velocity model. The bounding box location + (x, y, a, h) is taken as direct observation of the state space (linear + observation model). + + """ + + def __init__(self): + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. + self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement): + """ + Create track from unassociated measurement. + + Args: + measurement (ndarray): Bounding box coordinates (x, y, a, h) with + center position (x, y), aspect ratio a, and height h. + + Returns: + The mean vector (8 dimensional) and covariance matrix (8x8 + dimensional) of the new track. Unobserved velocities are + initialized to 0 mean. + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], 1e-5, + 10 * self._std_weight_velocity * measurement[3] + ] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean, covariance): + """ + Run Kalman filter prediction step. + + Args: + mean (ndarray): The 8 dimensional mean vector of the object state + at the previous time step. + covariance (ndarray): The 8x8 dimensional covariance matrix of the + object state at the previous time step. + + Returns: + The mean vector and covariance matrix of the predicted state. + Unobserved velocities are initialized to 0 mean. + """ + std_pos = [ + self._std_weight_position * mean[3], self._std_weight_position * + mean[3], 1e-2, self._std_weight_position * mean[3] + ] + std_vel = [ + self._std_weight_velocity * mean[3], self._std_weight_velocity * + mean[3], 1e-5, self._std_weight_velocity * mean[3] + ] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + #mean = np.dot(self._motion_mat, mean) + mean = np.dot(mean, self._motion_mat.T) + covariance = np.linalg.multi_dot( + (self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, mean, covariance): + """ + Project state distribution to measurement space. + + Args + mean (ndarray): The state's mean vector (8 dimensional array). + covariance (ndarray): The state's covariance matrix (8x8 dimensional). + + Returns: + The projected mean and covariance matrix of the given state estimate. + """ + std = [ + self._std_weight_position * mean[3], self._std_weight_position * + mean[3], 1e-1, self._std_weight_position * mean[3] + ] + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot((self._update_mat, covariance, + self._update_mat.T)) + return mean, covariance + innovation_cov + + def multi_predict(self, mean, covariance): + """ + Run Kalman filter prediction step (Vectorized version). 
+ + Args: + mean (ndarray): The Nx8 dimensional mean matrix of the object states + at the previous time step. + covariance (ndarray): The Nx8x8 dimensional covariance matrics of the + object states at the previous time step. + + Returns: + The mean vector and covariance matrix of the predicted state. + Unobserved velocities are initialized to 0 mean. + """ + std_pos = [ + self._std_weight_position * mean[:, 3], self._std_weight_position * + mean[:, 3], 1e-2 * np.ones_like(mean[:, 3]), + self._std_weight_position * mean[:, 3] + ] + std_vel = [ + self._std_weight_velocity * mean[:, 3], self._std_weight_velocity * + mean[:, 3], 1e-5 * np.ones_like(mean[:, 3]), + self._std_weight_velocity * mean[:, 3] + ] + sqr = np.square(np.r_[std_pos, std_vel]).T + + motion_cov = [] + for i in range(len(mean)): + motion_cov.append(np.diag(sqr[i])) + motion_cov = np.asarray(motion_cov) + + mean = np.dot(mean, self._motion_mat.T) + left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2)) + covariance = np.dot(left, self._motion_mat.T) + motion_cov + + return mean, covariance + + def update(self, mean, covariance, measurement): + """ + Run Kalman filter correction step. + + Args: + mean (ndarray): The predicted state's mean vector (8 dimensional). + covariance (ndarray): The state's covariance matrix (8x8 dimensional). + measurement (ndarray): The 4 dimensional measurement vector + (x, y, a, h), where (x, y) is the center position, a the aspect + ratio, and h the height of the bounding box. + + Returns: + The measurement-corrected state distribution. + """ + projected_mean, projected_cov = self.project(mean, covariance) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve( + (chol_factor, lower), + np.dot(covariance, self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot( + (kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, + mean, + covariance, + measurements, + only_position=False, + metric='maha'): + """ + Compute gating distance between state distribution and measurements. + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + + Args: + mean (ndarray): Mean vector over the state distribution (8 + dimensional). + covariance (ndarray): Covariance of the state distribution (8x8 + dimensional). + measurements (ndarray): An Nx4 dimensional matrix of N measurements, + each in format (x, y, a, h) where (x, y) is the bounding box center + position, a the aspect ratio, and h the height. + only_position (Optional[bool]): If True, distance computation is + done with respect to the bounding box center position only. + metric (str): Metric type, 'gaussian' or 'maha'. + + Returns + An array of length N, where the i-th element contains the squared + Mahalanobis distance between (mean, covariance) and `measurements[i]`. 
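+
+            For example, a full (x, y, a, h) gate compares this distance
+            against `chi2inv95[4]` (9.4877), while a position-only gate
+            uses `chi2inv95[2]` (5.9915).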
+ """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + d = measurements - mean + if metric == 'gaussian': + return np.sum(d * d, axis=1) + elif metric == 'maha': + cholesky_factor = np.linalg.cholesky(covariance) + z = scipy.linalg.solve_triangular( + cholesky_factor, + d.T, + lower=True, + check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) + return squared_maha + else: + raise ValueError('invalid distance metric') diff --git a/ppdet/modeling/mot/tracker/__init__.py b/ppdet/modeling/mot/tracker/__init__.py new file mode 100644 index 000000000..b74593b41 --- /dev/null +++ b/ppdet/modeling/mot/tracker/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import base_jde_tracker +from . import base_sde_tracker +from . import jde_tracker +from . import deepsort_tracker + +from .base_jde_tracker import * +from .base_sde_tracker import * +from .jde_tracker import * +from .deepsort_tracker import * diff --git a/ppdet/modeling/mot/tracker/base_jde_tracker.py b/ppdet/modeling/mot/tracker/base_jde_tracker.py new file mode 100644 index 000000000..abe27337d --- /dev/null +++ b/ppdet/modeling/mot/tracker/base_jde_tracker.py @@ -0,0 +1,267 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is borrow from https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py +""" + +import numpy as np +from collections import deque, OrderedDict +from ..matching import jde_matching as matching +from ppdet.core.workspace import register, serializable + +__all__ = [ + 'TrackState', + 'BaseTrack', + 'STrack', + 'joint_stracks', + 'sub_stracks', + 'remove_duplicate_stracks', +] + + +class TrackState(object): + New = 0 + Tracked = 1 + Lost = 2 + Removed = 3 + + +@register +@serializable +class BaseTrack(object): + _count = 0 + + track_id = 0 + is_activated = False + state = TrackState.New + + history = OrderedDict() + features = [] + curr_feature = None + score = 0 + start_frame = 0 + frame_id = 0 + time_since_update = 0 + + # multi-camera + location = (np.inf, np.inf) + + @property + def end_frame(self): + return self.frame_id + + @staticmethod + def next_id(): + BaseTrack._count += 1 + return BaseTrack._count + + def activate(self, *args): + raise NotImplementedError + + def predict(self): + raise NotImplementedError + + def update(self, *args, **kwargs): + raise NotImplementedError + + def mark_lost(self): + self.state = TrackState.Lost + + def mark_removed(self): + self.state = TrackState.Removed + + +@register +@serializable +class STrack(BaseTrack): + def __init__(self, tlwh, score, temp_feat, buffer_size=30): + # wait activate + self._tlwh = np.asarray(tlwh, dtype=np.float) + self.kalman_filter = None + self.mean, self.covariance = None, None + self.is_activated = False + + self.score = score + self.tracklet_len = 0 + + self.smooth_feat = None + self.update_features(temp_feat) + self.features = deque([], maxlen=buffer_size) + self.alpha = 0.9 + + def update_features(self, feat): + feat /= np.linalg.norm(feat) + self.curr_feat = feat + if self.smooth_feat is None: + self.smooth_feat = feat + else: + self.smooth_feat = self.alpha * self.smooth_feat + (1 - self.alpha + ) * feat + self.features.append(feat) + self.smooth_feat /= np.linalg.norm(self.smooth_feat) + + def predict(self): + mean_state = self.mean.copy() + if self.state != TrackState.Tracked: + mean_state[7] = 0 + self.mean, self.covariance = self.kalman_filter.predict(mean_state, + self.covariance) + + @staticmethod + def multi_predict(stracks, kalman_filter): + if len(stracks) > 0: + multi_mean = np.asarray([st.mean.copy() for st in stracks]) + multi_covariance = np.asarray([st.covariance for st in stracks]) + for i, st in enumerate(stracks): + if st.state != TrackState.Tracked: + multi_mean[i][7] = 0 + multi_mean, multi_covariance = kalman_filter.multi_predict( + multi_mean, multi_covariance) + for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): + stracks[i].mean = mean + stracks[i].covariance = cov + + def activate(self, kalman_filter, frame_id): + """Start a new tracklet""" + self.kalman_filter = kalman_filter + self.track_id = self.next_id() + self.mean, self.covariance = self.kalman_filter.initiate( + self.tlwh_to_xyah(self._tlwh)) + + self.tracklet_len = 0 + self.state = TrackState.Tracked + if frame_id == 1: + self.is_activated = True + self.frame_id = frame_id + self.start_frame = frame_id + + def re_activate(self, new_track, frame_id, new_id=False): + self.mean, self.covariance = self.kalman_filter.update( + self.mean, self.covariance, self.tlwh_to_xyah(new_track.tlwh)) + + self.update_features(new_track.curr_feat) + self.tracklet_len = 0 + self.state = TrackState.Tracked + self.is_activated = True + self.frame_id = frame_id + if new_id: + self.track_id = 
self.next_id() + + def update(self, new_track, frame_id, update_feature=True): + self.frame_id = frame_id + self.tracklet_len += 1 + + new_tlwh = new_track.tlwh + self.mean, self.covariance = self.kalman_filter.update( + self.mean, self.covariance, self.tlwh_to_xyah(new_tlwh)) + self.state = TrackState.Tracked + self.is_activated = True + + self.score = new_track.score + if update_feature: + self.update_features(new_track.curr_feat) + + @property + def tlwh(self): + """ + Get current position in bounding box format `(top left x, top left y, + width, height)`. + """ + if self.mean is None: + return self._tlwh.copy() + ret = self.mean[:4].copy() + ret[2] *= ret[3] + ret[:2] -= ret[2:] / 2 + return ret + + @property + def tlbr(self): + """ + Convert bounding box to format `(min x, min y, max x, max y)`, i.e., + `(top left, bottom right)`. + """ + ret = self.tlwh.copy() + ret[2:] += ret[:2] + return ret + + @staticmethod + def tlwh_to_xyah(tlwh): + """ + Convert bounding box to format `(center x, center y, aspect ratio, + height)`, where the aspect ratio is `width / height`. + """ + ret = np.asarray(tlwh).copy() + ret[:2] += ret[2:] / 2 + ret[2] /= ret[3] + return ret + + def to_xyah(self): + return self.tlwh_to_xyah(self.tlwh) + + @staticmethod + def tlbr_to_tlwh(tlbr): + ret = np.asarray(tlbr).copy() + ret[2:] -= ret[:2] + return ret + + @staticmethod + def tlwh_to_tlbr(tlwh): + ret = np.asarray(tlwh).copy() + ret[2:] += ret[:2] + return ret + + def __repr__(self): + return 'OT_{}_({}-{})'.format(self.track_id, self.start_frame, + self.end_frame) + + +def joint_stracks(tlista, tlistb): + exists = {} + res = [] + for t in tlista: + exists[t.track_id] = 1 + res.append(t) + for t in tlistb: + tid = t.track_id + if not exists.get(tid, 0): + exists[tid] = 1 + res.append(t) + return res + + +def sub_stracks(tlista, tlistb): + stracks = {} + for t in tlista: + stracks[t.track_id] = t + for t in tlistb: + tid = t.track_id + if stracks.get(tid, 0): + del stracks[tid] + return list(stracks.values()) + + +def remove_duplicate_stracks(stracksa, stracksb): + pdist = matching.iou_distance(stracksa, stracksb) + pairs = np.where(pdist < 0.15) + dupa, dupb = list(), list() + for p, q in zip(*pairs): + timep = stracksa[p].frame_id - stracksa[p].start_frame + timeq = stracksb[q].frame_id - stracksb[q].start_frame + if timep > timeq: + dupb.append(q) + else: + dupa.append(p) + resa = [t for i, t in enumerate(stracksa) if not i in dupa] + resb = [t for i, t in enumerate(stracksb) if not i in dupb] + return resa, resb diff --git a/ppdet/modeling/mot/tracker/base_sde_tracker.py b/ppdet/modeling/mot/tracker/base_sde_tracker.py new file mode 100644 index 000000000..294cf8be5 --- /dev/null +++ b/ppdet/modeling/mot/tracker/base_sde_tracker.py @@ -0,0 +1,145 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
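The three list helpers above only rely on `track_id` and the frame counters, so their behavior is easy to check in isolation. A small sketch (the import path follows the `tracker/__init__.py` shown earlier, which re-exports everything from `base_jde_tracker`):

    from types import SimpleNamespace
    from ppdet.modeling.mot.tracker import joint_stracks, sub_stracks

    # Stand-in tracks: these helpers only inspect .track_id.
    t1, t2, t3 = (SimpleNamespace(track_id=i) for i in (1, 2, 3))

    merged = joint_stracks([t1, t2], [t2, t3])
    print([t.track_id for t in merged])   # [1, 2, 3]: union, first list wins on ties

    left = sub_stracks([t1, t2, t3], [t2])
    print([t.track_id for t in left])     # [1, 3]: ids found in the second list removed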
+""" +This code is borrow from https://github.com/nwojke/deep_sort/blob/master/deep_sort/track.py +""" + +import numpy as np +from ppdet.core.workspace import register, serializable + +__all__ = ['TrackState', 'Track'] + + +class TrackState(object): + """ + Enumeration type for the single target track state. Newly created tracks are + classified as `tentative` until enough evidence has been collected. Then, + the track state is changed to `confirmed`. Tracks that are no longer alive + are classified as `deleted` to mark them for removal from the set of active + tracks. + """ + Tentative = 1 + Confirmed = 2 + Deleted = 3 + + +@register +@serializable +class Track(object): + """ + A single target track with state space `(x, y, a, h)` and associated + velocities, where `(x, y)` is the center of the bounding box, `a` is the + aspect ratio and `h` is the height. + + Args: + mean (ndarray): Mean vector of the initial state distribution. + covariance (ndarray): Covariance matrix of the initial state distribution. + track_id (int): A unique track identifier. + n_init (int): Number of consecutive detections before the track is confirmed. + The track state is set to `Deleted` if a miss occurs within the first + `n_init` frames. + max_age (int): The maximum number of consecutive misses before the track + state is set to `Deleted`. + feature (Optional[ndarray]): Feature vector of the detection this track + originates from. If not None, this feature is added to the `features` cache. + + Attributes: + hits (int): Total number of measurement updates. + age (int): Total number of frames since first occurance. + time_since_update (int): Total number of frames since last measurement + update. + state (TrackState): The current track state. + features (List[ndarray]): A cache of features. On each measurement update, + the associated feature vector is added to this list. + """ + + def __init__(self, + mean, + covariance, + track_id, + n_init, + max_age, + feature=None): + self.mean = mean + self.covariance = covariance + self.track_id = track_id + self.hits = 1 + self.age = 1 + self.time_since_update = 0 + + self.state = TrackState.Tentative + self.features = [] + if feature is not None: + self.features.append(feature) + + self._n_init = n_init + self._max_age = max_age + + def to_tlwh(self): + """Get position in format `(top left x, top left y, width, height)`.""" + ret = self.mean[:4].copy() + ret[2] *= ret[3] + ret[:2] -= ret[2:] / 2 + return ret + + def to_tlbr(self): + """Get position in bounding box format `(min x, miny, max x, max y)`.""" + ret = self.to_tlwh() + ret[2:] = ret[:2] + ret[2:] + return ret + + def predict(self, kalman_filter): + """ + Propagate the state distribution to the current time step using a Kalman + filter prediction step. + """ + self.mean, self.covariance = kalman_filter.predict(self.mean, + self.covariance) + self.age += 1 + self.time_since_update += 1 + + def update(self, kalman_filter, detection): + """ + Perform Kalman filter measurement update step and update the associated + detection feature cache. + """ + self.mean, self.covariance = kalman_filter.update(self.mean, + self.covariance, + detection.to_xyah()) + self.features.append(detection.feature) + + self.hits += 1 + self.time_since_update = 0 + if self.state == TrackState.Tentative and self.hits >= self._n_init: + self.state = TrackState.Confirmed + + def mark_missed(self): + """Mark this track as missed (no association at the current time step). 
+ """ + if self.state == TrackState.Tentative: + self.state = TrackState.Deleted + elif self.time_since_update > self._max_age: + self.state = TrackState.Deleted + + def is_tentative(self): + """Returns True if this track is tentative (unconfirmed).""" + return self.state == TrackState.Tentative + + def is_confirmed(self): + """Returns True if this track is confirmed.""" + return self.state == TrackState.Confirmed + + def is_deleted(self): + """Returns True if this track is dead and should be deleted.""" + return self.state == TrackState.Deleted diff --git a/ppdet/modeling/mot/tracker/deepsort_tracker.py b/ppdet/modeling/mot/tracker/deepsort_tracker.py new file mode 100644 index 000000000..128d62272 --- /dev/null +++ b/ppdet/modeling/mot/tracker/deepsort_tracker.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is borrow from https://github.com/nwojke/deep_sort/blob/master/deep_sort/tracker.py +""" + +import numpy as np + +from ..matching.deepsort_matching import NearestNeighborDistanceMetric +from ..matching.deepsort_matching import iou_cost, min_cost_matching, matching_cascade, gate_cost_matrix +from .base_sde_tracker import Track + +from ppdet.core.workspace import register, serializable +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['DeepSORTTracker'] + + +@register +@serializable +class DeepSORTTracker(object): + __inject__ = ['motion'] + """ + DeepSORT tracker + + Args: + img_size (list): input image size, [h, w] + budget (int): If not None, fix samples per class to at most this number. + Removes the oldest samples when the budget is reached. + max_age (int): maximum number of missed misses before a track is deleted + n_init (float): Number of frames that a track remains in initialization + phase. Number of consecutive detections before the track is confirmed. + The track state is set to `Deleted` if a miss occurs within the first + `n_init` frames. + metric_type (str): either "euclidean" or "cosine", the distance metric + used for measurement to track association. + matching_threshold (float): samples with larger distance are + considered an invalid match. + max_iou_distance (float): max iou distance threshold + motion (object): KalmanFilter instance + """ + + def __init__(self, + img_size=[608, 1088], + budget=100, + max_age=30, + n_init=3, + metric_type='cosine', + matching_threshold=0.2, + max_iou_distance=0.7, + motion='KalmanFilter'): + self.img_size = img_size + self.max_age = max_age + self.n_init = n_init + self.metric = NearestNeighborDistanceMetric(metric_type, + matching_threshold, budget) + self.max_iou_distance = max_iou_distance + self.motion = motion + + self.tracks = [] + self._next_id = 1 + + def predict(self): + """ + Propagate track state distributions one time step forward. + This function should be called once every time step, before `update`. 
+ """ + for track in self.tracks: + track.predict(self.motion) + + def update(self, detections): + """ + Perform measurement update and track management. + Args: + detections (list): List[ppdet.modeling.mot.utils.Detection] + A list of detections at the current time step. + """ + # Run matching cascade. + matches, unmatched_tracks, unmatched_detections = \ + self._match(detections) + + # Update track set. + for track_idx, detection_idx in matches: + self.tracks[track_idx].update(self.motion, + detections[detection_idx]) + for track_idx in unmatched_tracks: + self.tracks[track_idx].mark_missed() + for detection_idx in unmatched_detections: + self._initiate_track(detections[detection_idx]) + self.tracks = [t for t in self.tracks if not t.is_deleted()] + + # Update distance metric. + active_targets = [t.track_id for t in self.tracks if t.is_confirmed()] + features, targets = [], [] + for track in self.tracks: + if not track.is_confirmed(): + continue + features += track.features + targets += [track.track_id for _ in track.features] + track.features = [] + self.metric.partial_fit( + np.asarray(features), np.asarray(targets), active_targets) + output_stracks = self.tracks + return output_stracks + + def _match(self, detections): + def gated_metric(tracks, dets, track_indices, detection_indices): + features = np.array([dets[i].feature for i in detection_indices]) + targets = np.array([tracks[i].track_id for i in track_indices]) + cost_matrix = self.metric.distance(features, targets) + cost_matrix = gate_cost_matrix(self.motion, cost_matrix, tracks, + dets, track_indices, + detection_indices) + return cost_matrix + + # Split track set into confirmed and unconfirmed tracks. + confirmed_tracks = [ + i for i, t in enumerate(self.tracks) if t.is_confirmed() + ] + unconfirmed_tracks = [ + i for i, t in enumerate(self.tracks) if not t.is_confirmed() + ] + + # Associate confirmed tracks using appearance features. + matches_a, unmatched_tracks_a, unmatched_detections = \ + matching_cascade( + gated_metric, self.metric.matching_threshold, self.max_age, + self.tracks, detections, confirmed_tracks) + + # Associate remaining tracks together with unconfirmed tracks using IOU. + iou_track_candidates = unconfirmed_tracks + [ + k for k in unmatched_tracks_a + if self.tracks[k].time_since_update == 1 + ] + unmatched_tracks_a = [ + k for k in unmatched_tracks_a + if self.tracks[k].time_since_update != 1 + ] + matches_b, unmatched_tracks_b, unmatched_detections = \ + min_cost_matching( + iou_cost, self.max_iou_distance, self.tracks, + detections, iou_track_candidates, unmatched_detections) + + matches = matches_a + matches_b + unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b)) + return matches, unmatched_tracks, unmatched_detections + + def _initiate_track(self, detection): + mean, covariance = self.motion.initiate(detection.to_xyah()) + self.tracks.append( + Track(mean, covariance, self._next_id, self.n_init, self.max_age, + detection.feature)) + self._next_id += 1 diff --git a/ppdet/modeling/mot/tracker/jde_tracker.py b/ppdet/modeling/mot/tracker/jde_tracker.py new file mode 100644 index 000000000..b0162f7b2 --- /dev/null +++ b/ppdet/modeling/mot/tracker/jde_tracker.py @@ -0,0 +1,244 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is borrowed from https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py
+"""
+
+import paddle
+
+from ..matching import jde_matching as matching
+from .base_jde_tracker import TrackState, BaseTrack, STrack
+from .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks
+
+from ppdet.core.workspace import register, serializable
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['JDETracker']
+
+
+@register
+@serializable
+class JDETracker(object):
+    __inject__ = ['motion']
+    """
+    JDE tracker
+
+    Args:
+        det_thresh (float): threshold of detection score
+        track_buffer (int): buffer for tracker
+        min_box_area (int): min box area to filter out low quality boxes
+        tracked_thresh (float): linear assignment threshold of tracked
+            stracks and detections
+        r_tracked_thresh (float): linear assignment threshold of
+            tracked stracks and unmatched detections
+        unconfirmed_thresh (float): linear assignment threshold of
+            unconfirmed stracks and unmatched detections
+        motion (object): KalmanFilter instance
+    """
+
+    def __init__(self,
+                 det_thresh=0.3,
+                 track_buffer=30,
+                 min_box_area=200,
+                 tracked_thresh=0.7,
+                 r_tracked_thresh=0.5,
+                 unconfirmed_thresh=0.7,
+                 motion='KalmanFilter'):
+        self.det_thresh = det_thresh
+        self.track_buffer = track_buffer
+        self.min_box_area = min_box_area
+        self.tracked_thresh = tracked_thresh
+        self.r_tracked_thresh = r_tracked_thresh
+        self.unconfirmed_thresh = unconfirmed_thresh
+        self.motion = motion
+
+        self.frame_id = 0
+        self.tracked_stracks = []
+        self.lost_stracks = []
+        self.removed_stracks = []
+
+        self.max_time_lost = 0
+        # max_time_lost will be calculated: int(frame_rate / 30.0 * track_buffer)
+
+    def update(self, pred_dets, pred_embs):
+        """
+        Processes the image frame and finds bounding boxes (detections).
+        Associates the detections with corresponding tracklets and also handles
+        lost, removed, refound and active tracklets.
+
+        Args:
+            pred_dets (Tensor): Detection results of the image, shape is [N, 5].
+            pred_embs (Tensor): Embedding results of the image, shape is [N, 512].
+
+        Returns:
+            output_stracks (list): The list contains information regarding the
+                online tracklets for the received image tensor.
+        """
+        self.frame_id += 1
+        activated_starcks = []
+        # for storing active tracks, for the current frame
+        refind_stracks = []
+        # Lost Tracks whose detections are obtained in the current frame
+        lost_stracks = []
+        # The tracks which are not obtained in the current frame but are not
+        # removed. (Lost for some time lesser than the threshold for removing)
+        removed_stracks = []
+
+        # Filter out the image with box_num = 0.
pred_dets = [[0.0, 0.0, 0.0 ,0.0]] + empty_pred = True if len(pred_dets) == 1 and paddle.sum( + pred_dets) == 0.0 else False + """ Step 1: Network forward, get detections & embeddings""" + if len(pred_dets) > 0 and not empty_pred: + pred_dets = pred_dets.numpy() + pred_embs = pred_embs.numpy() + detections = [ + STrack(STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f, 30) + for (tlbrs, f) in zip(pred_dets, pred_embs) + ] + else: + detections = [] + ''' Add newly detected tracklets to tracked_stracks''' + unconfirmed = [] + tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + # previous tracks which are not active in the current frame are added in unconfirmed list + unconfirmed.append(track) + else: + # Active tracks are added to the local list 'tracked_stracks' + tracked_stracks.append(track) + """ Step 2: First association, with embedding""" + # Combining currently tracked_stracks and lost_stracks + strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) + # Predict the current location with KF + STrack.multi_predict(strack_pool, self.motion) + + dists = matching.embedding_distance(strack_pool, detections) + dists = matching.fuse_motion(self.motion, dists, strack_pool, + detections) + # The dists is the list of distances of the detection with the tracks in strack_pool + matches, u_track, u_detection = matching.linear_assignment( + dists, thresh=self.tracked_thresh) + # The matches is the array for corresponding matches of the detection with the corresponding strack_pool + + for itracked, idet in matches: + # itracked is the id of the track and idet is the detection + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + # If the track is active, add the detection to the track + track.update(detections[idet], self.frame_id) + activated_starcks.append(track) + else: + # We have obtained a detection from a track which is not active, + # hence put the track in refind_stracks list + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + # None of the steps below happen if there are no undetected tracks. + """ Step 3: Second association, with IOU""" + detections = [detections[i] for i in u_detection] + # detections is now a list of the unmatched detections + r_tracked_stracks = [] + # This is container for stracks which were tracked till the previous + # frame but no detection was found for it in the current frame. + + for i in u_track: + if strack_pool[i].state == TrackState.Tracked: + r_tracked_stracks.append(strack_pool[i]) + dists = matching.iou_distance(r_tracked_stracks, detections) + matches, u_track, u_detection = matching.linear_assignment( + dists, thresh=self.r_tracked_thresh) + # matches is the list of detections which matched with corresponding + # tracks by IOU distance method. 
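Every association step in `update` reduces to the same primitive: solve an assignment problem on a cost matrix, then discard pairs whose cost exceeds the threshold. `jde_matching.linear_assignment` itself is not part of this excerpt, so the stand-in below reproduces the idea with `scipy.optimize.linear_sum_assignment`; the helper name and the (matches, unmatched tracks, unmatched detections) return convention are assumptions inferred from the call sites here:

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    def linear_assignment_sketch(cost, thresh):
        # Toy stand-in for jde_matching.linear_assignment.
        if cost.size == 0:
            return (np.empty((0, 2), dtype=int),
                    list(range(cost.shape[0])), list(range(cost.shape[1])))
        rows, cols = linear_sum_assignment(cost)
        matches = [(r, c) for r, c in zip(rows, cols) if cost[r, c] <= thresh]
        matched_r = {r for r, _ in matches}
        matched_c = {c for _, c in matches}
        u_track = [r for r in range(cost.shape[0]) if r not in matched_r]
        u_detection = [c for c in range(cost.shape[1]) if c not in matched_c]
        return np.asarray(matches, dtype=int).reshape(-1, 2), u_track, u_detection

    # Two tracks vs. three detections; detection 2 matches no track well.
    cost = np.array([[0.1, 0.9, 0.8],
                     [0.7, 0.2, 0.9]])
    print(linear_assignment_sketch(cost, thresh=0.5))
    # (array([[0, 0], [1, 1]]), [], [2])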
+ + for itracked, idet in matches: + track = r_tracked_stracks[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(det, self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + # Same process done for some unmatched detections, but now considering IOU_distance as measure + + for it in u_track: + track = r_tracked_stracks[it] + if not track.state == TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + # If no detections are obtained for tracks (u_track), the tracks are added to lost_tracks list and are marked lost + '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' + detections = [detections[i] for i in u_detection] + dists = matching.iou_distance(unconfirmed, detections) + matches, u_unconfirmed, u_detection = matching.linear_assignment( + dists, thresh=self.unconfirmed_thresh) + for itracked, idet in matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + activated_starcks.append(unconfirmed[itracked]) + + # The tracks which are yet not matched + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + + # after all these confirmation steps, if a new detection is found, it is initialized for a new track + """ Step 4: Init new stracks""" + for inew in u_detection: + track = detections[inew] + if track.score < self.det_thresh: + continue + track.activate(self.motion, self.frame_id) + activated_starcks.append(track) + """ Step 5: Update state""" + # If the tracks are lost for more frames than the threshold number, the tracks are removed. + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + + # Update the self.tracked_stracks and self.lost_stracks using the updates in this step. + self.tracked_stracks = [ + t for t in self.tracked_stracks if t.state == TrackState.Tracked + ] + self.tracked_stracks = joint_stracks(self.tracked_stracks, + activated_starcks) + self.tracked_stracks = joint_stracks(self.tracked_stracks, + refind_stracks) + + self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) + self.removed_stracks.extend(removed_stracks) + self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks( + self.tracked_stracks, self.lost_stracks) + # get scores of lost tracks + output_stracks = [ + track for track in self.tracked_stracks if track.is_activated + ] + + logger.debug('===========Frame {}=========='.format(self.frame_id)) + logger.debug('Activated: {}'.format( + [track.track_id for track in activated_starcks])) + logger.debug('Refind: {}'.format( + [track.track_id for track in refind_stracks])) + logger.debug('Lost: {}'.format( + [track.track_id for track in lost_stracks])) + logger.debug('Removed: {}'.format( + [track.track_id for track in removed_stracks])) + + return output_stracks diff --git a/ppdet/modeling/mot/utils.py b/ppdet/modeling/mot/utils.py new file mode 100644 index 000000000..eff8d472f --- /dev/null +++ b/ppdet/modeling/mot/utils.py @@ -0,0 +1,181 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2
+import time
+import paddle
+import numpy as np
+
+__all__ = [
+    'Timer',
+    'Detection',
+    'load_det_results',
+    'preprocess_reid',
+    'get_crops',
+    'clip_box',
+    'scale_coords',
+]
+
+
+class Timer(object):
+    """
+    This class is used to compute and print the current FPS during evaluation.
+    """
+
+    def __init__(self):
+        self.total_time = 0.
+        self.calls = 0
+        self.start_time = 0.
+        self.diff = 0.
+        self.average_time = 0.
+        self.duration = 0.
+
+    def tic(self):
+        # using time.time instead of time.clock because time.clock
+        # does not normalize for multithreading
+        self.start_time = time.time()
+
+    def toc(self, average=True):
+        self.diff = time.time() - self.start_time
+        self.total_time += self.diff
+        self.calls += 1
+        self.average_time = self.total_time / self.calls
+        if average:
+            self.duration = self.average_time
+        else:
+            self.duration = self.diff
+        return self.duration
+
+    def clear(self):
+        self.total_time = 0.
+        self.calls = 0
+        self.start_time = 0.
+        self.diff = 0.
+        self.average_time = 0.
+        self.duration = 0.
+
+
+class Detection(object):
+    """
+    This class represents a bounding box detection in a single image.
+
+    Args:
+        tlwh (ndarray): Bounding box in format `(top left x, top left y,
+            width, height)`.
+        confidence (ndarray): Detector confidence score.
+        feature (Tensor): A feature vector that describes the object
+            contained in this image.
+    """
+
+    def __init__(self, tlwh, confidence, feature):
+        self.tlwh = np.asarray(tlwh, dtype=np.float32)
+        self.confidence = np.asarray(confidence, dtype=np.float32)
+        self.feature = feature.numpy()
+
+    def to_tlbr(self):
+        """
+        Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,
+        `(top left, bottom right)`.
+        """
+        ret = self.tlwh.copy()
+        ret[2:] += ret[:2]
+        return ret
+
+    def to_xyah(self):
+        """
+        Convert bounding box to format `(center x, center y, aspect ratio,
+        height)`, where the aspect ratio is `width / height`.
+ """ + ret = self.tlwh.copy() + ret[:2] += ret[2:] / 2 + ret[2] /= ret[3] + return ret + + +def load_det_results(det_file, num_frames): + assert os.path.exists(det_file) and os.path.isfile(det_file), \ + 'Error: det_file: {} not exist or not a file.'.format(det_file) + labels = np.loadtxt(det_file, dtype='float32', delimiter=',') + results_list = [] + for frame_i in range(0, num_frames): + results = {'bbox': [], 'score': []} + lables_with_frame = labels[labels[:, 0] == frame_i + 1] + for l in lables_with_frame: + results['bbox'].append(l[2:6]) + results['score'].append(l[6]) + results_list.append(results) + return results_list + + +def scale_coords(coords, input_shape, im_shape, scale_factor): + im_shape = im_shape.numpy()[0] + ratio = scale_factor.numpy()[0][0] + img0_shape = [int(im_shape[0] / ratio), int(im_shape[1] / ratio)] + + pad_w = (input_shape[1] - img0_shape[1] * ratio) / 2 + pad_h = (input_shape[0] - img0_shape[0] * ratio) / 2 + coords[:, 0::2] -= pad_w + coords[:, 1::2] -= pad_h + coords[:, 0:4] /= paddle.to_tensor(ratio) + coords[:, :4] = paddle.clip(coords[:, :4], min=0, max=coords[:, :4].max()) + return coords.round() + + +def clip_box(xyxy, input_shape, im_shape, scale_factor): + im_shape = im_shape.numpy()[0] + ratio = scale_factor.numpy()[0][0] + img0_shape = [int(im_shape[0] / ratio), int(im_shape[1] / ratio)] + + xyxy[:, 0::2] = paddle.clip(xyxy[:, 0::2], min=0, max=img0_shape[1]) + xyxy[:, 1::2] = paddle.clip(xyxy[:, 1::2], min=0, max=img0_shape[0]) + return xyxy + + +def get_crops(xyxy, ori_img, pred_scores, w, h): + crops = [] + keep_scores = [] + xyxy = xyxy.numpy().astype(np.int64) + ori_img = ori_img.numpy() + ori_img = np.squeeze(ori_img, axis=0).transpose(1, 0, 2) + pred_scores = pred_scores.numpy() + for i, bbox in enumerate(xyxy): + if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]: + continue + crop = ori_img[bbox[0]:bbox[2], bbox[1]:bbox[3], :] + crops.append(crop) + keep_scores.append(pred_scores[i]) + if len(crops) == 0: + return [], [] + crops = preprocess_reid(crops, w, h) + return crops, keep_scores + + +def preprocess_reid(imgs, + w=64, + h=192, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]): + im_batch = [] + for img in imgs: + img = cv2.resize(img, (w, h)) + img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255 + img_mean = np.array(mean).reshape((3, 1, 1)) + img_std = np.array(std).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + img = np.expand_dims(img, axis=0) + im_batch.append(img) + im_batch = np.concatenate(im_batch, 0) + return im_batch diff --git a/ppdet/modeling/mot/visualization.py b/ppdet/modeling/mot/visualization.py new file mode 100644 index 000000000..74b1f94e3 --- /dev/null +++ b/ppdet/modeling/mot/visualization.py @@ -0,0 +1,126 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import cv2 +import numpy as np + + +def tlwhs_to_tlbrs(tlwhs): + tlbrs = np.copy(tlwhs) + if len(tlbrs) == 0: + return tlbrs + tlbrs[:, 2] += tlwhs[:, 0] + tlbrs[:, 3] += tlwhs[:, 1] + return tlbrs + + +def get_color(idx): + idx = idx * 3 + color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) + return color + + +def resize_image(image, max_size=800): + if max(image.shape[:2]) > max_size: + scale = float(max_size) / max(image.shape[:2]) + image = cv2.resize(image, None, fx=scale, fy=scale) + return image + + +def plot_tracking(image, + tlwhs, + obj_ids, + scores=None, + frame_id=0, + fps=0., + ids2=None): + im = np.ascontiguousarray(np.copy(image)) + im_h, im_w = im.shape[:2] + + top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 + + text_scale = max(1, image.shape[1] / 1600.) + text_thickness = 2 + line_thickness = max(1, int(image.shape[1] / 500.)) + + radius = max(5, int(im_w / 140.)) + cv2.putText( + im, + 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), + (0, int(15 * text_scale)), + cv2.FONT_HERSHEY_PLAIN, + text_scale, (0, 0, 255), + thickness=2) + + for i, tlwh in enumerate(tlwhs): + x1, y1, w, h = tlwh + intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) + obj_id = int(obj_ids[i]) + id_text = '{}'.format(int(obj_id)) + if ids2 is not None: + id_text = id_text + ', {}'.format(int(ids2[i])) + _line_thickness = 1 if obj_id <= 0 else line_thickness + color = get_color(abs(obj_id)) + cv2.rectangle( + im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) + cv2.putText( + im, + id_text, (intbox[0], intbox[1] + 30), + cv2.FONT_HERSHEY_PLAIN, + text_scale, (0, 0, 255), + thickness=text_thickness) + return im + + +def plot_trajectory(image, tlwhs, track_ids): + image = image.copy() + for one_tlwhs, track_id in zip(tlwhs, track_ids): + color = get_color(int(track_id)) + for tlwh in one_tlwhs: + x1, y1, w, h = tuple(map(int, tlwh)) + cv2.circle( + image, (int(x1 + 0.5 * w), int(y1 + h)), 2, color, thickness=2) + return image + + +def plot_detections(image, tlbrs, scores=None, color=(255, 0, 0), ids=None): + im = np.copy(image) + text_scale = max(1, image.shape[1] / 800.) 
+ thickness = 2 if text_scale > 1.3 else 1 + for i, det in enumerate(tlbrs): + x1, y1, x2, y2 = np.asarray(det[:4], dtype=np.int) + if len(det) >= 7: + label = 'det' if det[5] > 0 else 'trk' + if ids is not None: + text = '{}# {:.2f}: {:d}'.format(label, det[6], ids[i]) + cv2.putText( + im, + text, (x1, y1 + 30), + cv2.FONT_HERSHEY_PLAIN, + text_scale, (0, 255, 255), + thickness=thickness) + else: + text = '{}# {:.2f}'.format(label, det[6]) + + if scores is not None: + text = '{:.2f}'.format(scores[i]) + cv2.putText( + im, + text, (x1, y1 + 30), + cv2.FONT_HERSHEY_PLAIN, + text_scale, (0, 255, 255), + thickness=thickness) + + cv2.rectangle(im, (x1, y1), (x2, y2), color, 2) + return im diff --git a/ppdet/modeling/necks/yolo_fpn.py b/ppdet/modeling/necks/yolo_fpn.py index 25458974a..3e65a7db8 100644 --- a/ppdet/modeling/necks/yolo_fpn.py +++ b/ppdet/modeling/necks/yolo_fpn.py @@ -52,7 +52,13 @@ def add_coord(x, data_format): class YoloDetBlock(nn.Layer): - def __init__(self, ch_in, channel, norm_type, name, data_format='NCHW'): + def __init__(self, + ch_in, + channel, + norm_type, + freeze_norm=False, + name='', + data_format='NCHW'): """ YOLODetBlock layer for yolov3, see https://arxiv.org/abs/1804.02767 @@ -60,6 +66,7 @@ class YoloDetBlock(nn.Layer): ch_in (int): input channel channel (int): base channel norm_type (str): batch norm type + freeze_norm (bool): whether to freeze norm, default False name (str): layer name data_format (str): data format, NCHW or NHWC """ @@ -87,6 +94,7 @@ class YoloDetBlock(nn.Layer): filter_size=filter_size, padding=(filter_size - 1) // 2, norm_type=norm_type, + freeze_norm=freeze_norm, data_format=data_format, name=name + post_name)) @@ -96,6 +104,7 @@ class YoloDetBlock(nn.Layer): filter_size=3, padding=1, norm_type=norm_type, + freeze_norm=freeze_norm, data_format=data_format, name=name + '.tip') @@ -112,7 +121,8 @@ class SPP(nn.Layer): k, pool_size, norm_type, - name, + freeze_norm=False, + name='', act='leaky', data_format='NCHW'): """ @@ -123,7 +133,9 @@ class SPP(nn.Layer): ch_out (int): output channel of conv layer k (int): kernel size of conv layer norm_type (str): batch norm type + freeze_norm (bool): whether to freeze norm, default False name (str): layer name + act (str): activation function data_format (str): data format, NCHW or NHWC """ super(SPP, self).__init__() @@ -145,6 +157,7 @@ class SPP(nn.Layer): k, padding=k // 2, norm_type=norm_type, + freeze_norm=freeze_norm, name=name, act=act, data_format=data_format) @@ -210,7 +223,8 @@ class CoordConv(nn.Layer): filter_size, padding, norm_type, - name, + freeze_norm=False, + name='', data_format='NCHW'): """ CoordConv layer @@ -232,6 +246,7 @@ class CoordConv(nn.Layer): filter_size=filter_size, padding=padding, norm_type=norm_type, + freeze_norm=freeze_norm, data_format=data_format, name=name) self.data_format = data_format @@ -419,6 +434,7 @@ class YOLOv3FPN(nn.Layer): def __init__(self, in_channels=[256, 512, 1024], norm_type='bn', + freeze_norm=False, data_format='NCHW'): """ YOLOv3FPN layer @@ -449,6 +465,7 @@ class YOLOv3FPN(nn.Layer): in_channel, channel=512 // (2**i), norm_type=norm_type, + freeze_norm=freeze_norm, data_format=data_format, name=name)) self.yolo_blocks.append(yolo_block) @@ -466,14 +483,17 @@ class YOLOv3FPN(nn.Layer): stride=1, padding=0, norm_type=norm_type, + freeze_norm=freeze_norm, data_format=data_format, name=name)) self.routes.append(route) - def forward(self, blocks): + def forward(self, blocks, for_mot=False): assert len(blocks) == self.num_blocks blocks = 
blocks[::-1] yolo_feats = [] + if for_mot: + emb_feats = [] for i, block in enumerate(blocks): if i > 0: if self.data_format == 'NCHW': @@ -483,12 +503,19 @@ class YOLOv3FPN(nn.Layer): route, tip = self.yolo_blocks[i](block) yolo_feats.append(tip) + if for_mot: + # add emb_feats output + emb_feats.append(route) + if i < self.num_blocks - 1: route = self.routes[i](route) route = F.interpolate( route, scale_factor=2., data_format=self.data_format) - return yolo_feats + if for_mot: + return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} + else: + return yolo_feats @classmethod def from_config(cls, cfg, input_shape): @@ -507,6 +534,7 @@ class PPYOLOFPN(nn.Layer): def __init__(self, in_channels=[512, 1024, 2048], norm_type='bn', + freeze_norm=False, data_format='NCHW', coord_conv=False, conv_block_num=2, @@ -568,22 +596,26 @@ class PPYOLOFPN(nn.Layer): [ 'conv{}'.format(2 * j), ConvLayer, [c_in, c_out, 1], dict( - padding=0, norm_type=norm_type) + padding=0, + norm_type=norm_type, + freeze_norm=freeze_norm) ], [ 'conv{}'.format(2 * j + 1), ConvBNLayer, [c_out, c_out * 2, 3], dict( - padding=1, norm_type=norm_type) + padding=1, + norm_type=norm_type, + freeze_norm=freeze_norm) ], ] c_in, c_out = c_out * 2, c_out base_cfg += [[ 'route', ConvLayer, [c_in, c_out, 1], dict( - padding=0, norm_type=norm_type) + padding=0, norm_type=norm_type, freeze_norm=freeze_norm) ], [ 'tip', ConvLayer, [c_out, c_out * 2, 3], dict( - padding=1, norm_type=norm_type) + padding=1, norm_type=norm_type, freeze_norm=freeze_norm) ]] if self.conv_block_num == 2: @@ -591,7 +623,9 @@ class PPYOLOFPN(nn.Layer): if self.spp: spp_cfg = [[ 'spp', SPP, [channel * 4, channel, 1], dict( - pool_size=[5, 9, 13], norm_type=norm_type) + pool_size=[5, 9, 13], + norm_type=norm_type, + freeze_norm=freeze_norm) ]] else: spp_cfg = [] @@ -603,7 +637,9 @@ class PPYOLOFPN(nn.Layer): if self.spp and i == 0: spp_cfg = [[ 'spp', SPP, [c_in * 4, c_in, 1], dict( - pool_size=[5, 9, 13], norm_type=norm_type) + pool_size=[5, 9, 13], + norm_type=norm_type, + freeze_norm=freeze_norm) ]] else: spp_cfg = [] @@ -623,14 +659,17 @@ class PPYOLOFPN(nn.Layer): stride=1, padding=0, norm_type=norm_type, + freeze_norm=freeze_norm, data_format=data_format, name=name)) self.routes.append(route) - def forward(self, blocks): + def forward(self, blocks, for_mot=False): assert len(blocks) == self.num_blocks blocks = blocks[::-1] yolo_feats = [] + if for_mot: + emb_feats = [] for i, block in enumerate(blocks): if i > 0: if self.data_format == 'NCHW': @@ -640,12 +679,19 @@ class PPYOLOFPN(nn.Layer): route, tip = self.yolo_blocks[i](block) yolo_feats.append(tip) + if for_mot: + # add emb_feats output + emb_feats.append(route) + if i < self.num_blocks - 1: route = self.routes[i](route) route = F.interpolate( route, scale_factor=2., data_format=self.data_format) - return yolo_feats + if for_mot: + return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} + else: + return yolo_feats @classmethod def from_config(cls, cfg, input_shape): diff --git a/ppdet/modeling/ops.py b/ppdet/modeling/ops.py index f190a4895..dc47d1294 100644 --- a/ppdet/modeling/ops.py +++ b/ppdet/modeling/ops.py @@ -52,6 +52,7 @@ def mish(x): def batch_norm(ch, norm_type='bn', norm_decay=0., + freeze_norm=False, initializer=None, data_format='NCHW'): if norm_type == 'sync_bn': @@ -59,13 +60,30 @@ def batch_norm(ch, else: batch_norm = nn.BatchNorm2D - return batch_norm( + norm_lr = 0. if freeze_norm else 1. 
+ weight_attr = ParamAttr( + initializer=initializer, + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True) + bias_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True) + + norm_layer = batch_norm( ch, - weight_attr=ParamAttr( - initializer=initializer, regularizer=L2Decay(norm_decay)), - bias_attr=ParamAttr(regularizer=L2Decay(norm_decay)), + weight_attr=weight_attr, + bias_attr=bias_attr, data_format=data_format) + norm_params = norm_layer.parameters() + if freeze_norm: + for param in norm_params: + param.stop_gradient = True + + return norm_layer + @paddle.jit.not_to_static def roi_pool(input, diff --git a/ppdet/modeling/post_process.py b/ppdet/modeling/post_process.py index 091994092..b5dfc725a 100644 --- a/ppdet/modeling/post_process.py +++ b/ppdet/modeling/post_process.py @@ -18,13 +18,18 @@ import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.bbox_utils import nonempty_bbox, rbox2poly, pd_rbox2poly -from . import ops try: from collections.abc import Sequence except Exception: from collections import Sequence -__all__ = ['BBoxPostProcess', 'MaskPostProcess', 'FCOSPostProcess'] +__all__ = [ + 'BBoxPostProcess', + 'MaskPostProcess', + 'FCOSPostProcess', + 'S2ANetBBoxPostProcess', + 'JDEBBoxPostProcess', +] @register @@ -307,3 +312,33 @@ class S2ANetBBoxPostProcess(object): pred_bbox = paddle.stack([x1, y1, x2, y2, x3, y3, x4, y4], axis=-1) pred_result = paddle.concat([pred_label_score, pred_bbox], axis=1) return pred_result + + +@register +class JDEBBoxPostProcess(BBoxPostProcess): + def __call__(self, head_out, anchors): + """ + Decode the bbox and do NMS for JDE model. + + Args: + head_out (list): Bbox_pred and cls_prob of bbox_head output. + anchors (list): Anchors of JDE model. + + Returns: + boxes_idx (Tensor): The index of kept bboxes after decode 'JDEBox'. + bbox_pred (Tensor): The output is the prediction with shape [N, 6] + including labels, scores and bboxes. + bbox_num (Tensor): The number of prediction of each batch with shape [N]. + nms_keep_idx (Tensor): The index of kept bboxes after NMS. + """ + boxes_idx, bboxes, score = self.decode(head_out, anchors) + bbox_pred, bbox_num, nms_keep_idx = self.nms(bboxes, score, + self.num_classes) + if bbox_pred.shape[0] == 0: + bbox_pred = paddle.to_tensor( + np.array( + [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32')) + bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) + nms_keep_idx = paddle.to_tensor(np.array([[0]], dtype='int32')) + + return boxes_idx, bbox_pred, bbox_num, nms_keep_idx diff --git a/ppdet/modeling/reid/__init__.py b/ppdet/modeling/reid/__init__.py new file mode 100644 index 000000000..a000c9532 --- /dev/null +++ b/ppdet/modeling/reid/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import jde_embedding_head +from . 
import pyramidal_embedding +from . import resnet + +from .jde_embedding_head import * +from .pyramidal_embedding import * +from .resnet import * diff --git a/ppdet/modeling/reid/jde_embedding_head.py b/ppdet/modeling/reid/jde_embedding_head.py new file mode 100644 index 000000000..5b108387a --- /dev/null +++ b/ppdet/modeling/reid/jde_embedding_head.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register +from paddle.nn.initializer import Normal, Constant + +__all__ = ['JDEEmbeddingHead'] + + +class LossParam(nn.Layer): + def __init__(self, init_value=0., use_uncertainy=True): + super(LossParam, self).__init__() + self.loss_param = self.create_parameter( + shape=[1], + attr=ParamAttr(initializer=Constant(value=init_value)), + dtype="float32") + + def forward(self, inputs): + out = paddle.exp(-self.loss_param) * inputs + self.loss_param + return out * 0.5 + + +@register +class JDEEmbeddingHead(nn.Layer): + __shared__ = ['num_classes'] + __inject__ = ['emb_loss', 'jde_loss'] + """ + JDEEmbeddingHead + Args: + num_classes(int): Number of classes. Only support one class tracking. + num_identifiers(int): Number of identifiers. + anchor_levels(int): Number of anchor levels, same as FPN levels. + anchor_scales(int): Number of anchor scales on each FPN level. + embedding_dim(int): Embedding dimension. Default: 512. 
+ emb_loss(object): Instance of 'JDEEmbeddingLoss' + jde_loss(object): Instance of 'JDELoss' + """ + + def __init__( + self, + num_classes=1, + num_identifiers=1, # defined by dataset.total_identities + anchor_levels=3, + anchor_scales=4, + embedding_dim=512, + emb_loss='JDEEmbeddingLoss', + jde_loss='JDELoss'): + super(JDEEmbeddingHead, self).__init__() + self.num_classes = num_classes + self.num_identifiers = num_identifiers + self.anchor_levels = anchor_levels + self.anchor_scales = anchor_scales + self.embedding_dim = embedding_dim + self.emb_loss = emb_loss + self.jde_loss = jde_loss + + self.emb_scale = math.sqrt(2) * math.log( + self.num_identifiers - 1) if self.num_identifiers > 1 else 1 + + self.identify_outputs = [] + self.loss_params_cls = [] + self.loss_params_reg = [] + self.loss_params_ide = [] + for i in range(self.anchor_levels): + name = 'identify_output.{}'.format(i) + identify_output = self.add_sublayer( + name, + nn.Conv2D( + in_channels=64 * (2**self.anchor_levels) // (2**i), + out_channels=self.embedding_dim, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(name=name + '.conv.weights'), + bias_attr=ParamAttr( + name=name + '.conv.bias', regularizer=L2Decay(0.)))) + self.identify_outputs.append(identify_output) + + loss_p_cls = self.add_sublayer('cls.{}'.format(i), LossParam(-4.15)) + self.loss_params_cls.append(loss_p_cls) + loss_p_reg = self.add_sublayer('reg.{}'.format(i), LossParam(-4.85)) + self.loss_params_reg.append(loss_p_reg) + loss_p_ide = self.add_sublayer('ide.{}'.format(i), LossParam(-2.3)) + self.loss_params_ide.append(loss_p_ide) + + self.classifier = self.add_sublayer( + 'classifier', + nn.Linear( + self.embedding_dim, + self.num_identifiers, + weight_attr=ParamAttr( + learning_rate=1., initializer=Normal( + mean=0.0, std=0.01)), + bias_attr=ParamAttr( + learning_rate=2., regularizer=L2Decay(0.)))) + + def forward(self, + identify_feats, + targets=None, + loss_confs=None, + loss_boxes=None, + test_emb=False): + assert len(identify_feats) == self.anchor_levels + ide_outs = [] + for feat, ide_head in zip(identify_feats, self.identify_outputs): + ide_outs.append(ide_head(feat)) + + if self.training: + assert targets != None + assert len(loss_confs) == len(loss_boxes) == self.anchor_levels + loss_ides = self.emb_loss(ide_outs, targets, self.emb_scale, + self.classifier) + return self.jde_loss(loss_confs, loss_boxes, loss_ides, + self.loss_params_cls, self.loss_params_reg, + self.loss_params_ide, targets) + else: + if test_emb: + assert targets != None + embs_and_gts = self.get_emb_and_gt_outs(ide_outs, targets) + return embs_and_gts + else: + emb_outs = self.get_emb_outs(ide_outs) + return emb_outs + + def get_emb_and_gt_outs(self, ide_outs, targets): + emb_and_gts = [] + for i, p_ide in enumerate(ide_outs): + t_conf = targets['tconf{}'.format(i)] + t_ide = targets['tide{}'.format(i)] + + p_ide = p_ide.transpose((0, 2, 3, 1)) + p_ide_flatten = paddle.reshape(p_ide, [-1, self.embedding_dim]) + + mask = t_conf > 0 + mask = paddle.cast(mask, dtype="int64") + emb_mask = mask.max(1).flatten() + emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten() + if len(emb_mask_inds) > 0: + t_ide_flatten = paddle.reshape(t_ide.max(1), [-1, 1]) + tids = paddle.gather(t_ide_flatten, emb_mask_inds) + + embedding = paddle.gather(p_ide_flatten, emb_mask_inds) + embedding = self.emb_scale * F.normalize(embedding) + emb_and_gt = paddle.concat([embedding, tids], axis=1) + emb_and_gts.append(emb_and_gt) + + if len(emb_and_gts) > 0: + return 
paddle.concat(emb_and_gts, axis=0) + else: + return paddle.zeros((1, self.embedding_dim + 1)) + + def get_emb_outs(self, ide_outs): + emb_outs = [] + for i, p_ide in enumerate(ide_outs): + p_ide = p_ide.transpose((0, 2, 3, 1)) + + p_ide_repeat = paddle.tile( + p_ide.unsqueeze(axis=0), [1, self.anchor_scales, 1, 1, 1]) + embedding = F.normalize(p_ide_repeat, axis=-1) + emb = paddle.reshape(embedding, [-1, self.embedding_dim]) + emb_outs.append(emb) + + if len(emb_outs) > 0: + return paddle.concat(emb_outs, axis=0) + else: + return paddle.zeros((1, self.embedding_dim)) diff --git a/ppdet/modeling/reid/pyramidal_embedding.py b/ppdet/modeling/reid/pyramidal_embedding.py new file mode 100644 index 000000000..e7520f7fa --- /dev/null +++ b/ppdet/modeling/reid/pyramidal_embedding.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Normal, Constant +from paddle import ParamAttr +from .resnet import * +from ppdet.core.workspace import register + +__all__ = ['PCBPlusDropoutPyramid'] + + +@register +class PCBPlusDropoutPyramid(nn.Layer): + def __init__( + self, + input_ch=2048, + num_stripes=6, # number of sub-parts + used_levels=(1, 1, 1, 1, 1, 1), + num_classes=751, + last_conv_stride=1, + last_conv_dilation=1, + num_conv_out_channels=128): + super(PCBPlusDropoutPyramid, self).__init__() + self.num_stripes = num_stripes + self.used_levels = used_levels + self.num_classes = num_classes + + self.num_in_each_level = [i for i in range(self.num_stripes, 0, -1)] + self.num_branches = sum(self.num_in_each_level) + + self.base = ResNet101( + lr_mult=0.1, + last_conv_stride=last_conv_stride, + last_conv_dilation=last_conv_dilation) + self.dropout_layer = nn.Dropout(p=0.2) + self.pyramid_conv_list0, self.pyramid_fc_list0 = self.basic_branch( + num_conv_out_channels, input_ch) + + def basic_branch(self, num_conv_out_channels, input_ch): + # the level indexes are defined from fine to coarse, + # the branch will contain one more part than that of its previous level + # the sliding step is set to 1 + pyramid_conv_list = nn.LayerList() + pyramid_fc_list = nn.LayerList() + + idx_levels = 0 + for idx_branches in range(self.num_branches): + if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): + idx_levels += 1 + if self.used_levels[idx_levels] == 0: + continue + pyramid_conv_list.append( + nn.Sequential( + nn.Conv2D(input_ch, num_conv_out_channels, 1), + nn.BatchNorm2D(num_conv_out_channels), nn.ReLU())) + + idx_levels = 0 + for idx_branches in range(self.num_branches): + if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): + idx_levels += 1 + if self.used_levels[idx_levels] == 0: + continue + name = "Linear_branch_id_{}".format(idx_branches) + fc = nn.Linear( + 
in_features=num_conv_out_channels,
+                out_features=self.num_classes,
+                weight_attr=ParamAttr(
+                    name=name + "_weights",
+                    initializer=Normal(
+                        mean=0., std=0.001)),
+                bias_attr=ParamAttr(
+                    name=name + "_bias", initializer=Constant(value=0.)))
+            pyramid_fc_list.append(fc)
+        return pyramid_conv_list, pyramid_fc_list
+
+    def pyramid_forward(self, feat):
+        each_stripe_size = int(feat.shape[2] / self.num_stripes)
+
+        feat_list, logits_list = [], []
+        idx_levels = 0
+        used_branches = 0
+        for idx_branches in range(self.num_branches):
+            if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]):
+                idx_levels += 1
+            if self.used_levels[idx_levels] == 0:
+                continue
+            idx_in_each_level = idx_branches - sum(self.num_in_each_level[
+                0:idx_levels])
+            stripe_size_in_each_level = each_stripe_size * (idx_levels + 1)
+            start = idx_in_each_level * each_stripe_size
+            end = start + stripe_size_in_each_level
+
+            k = feat.shape[-1]
+            local_feat_avgpool = F.avg_pool2d(
+                feat[:, :, start:end, :],
+                kernel_size=(stripe_size_in_each_level, k))
+            local_feat_maxpool = F.max_pool2d(
+                feat[:, :, start:end, :],
+                kernel_size=(stripe_size_in_each_level, k))
+            local_feat = local_feat_avgpool + local_feat_maxpool
+
+            local_feat = self.pyramid_conv_list0[used_branches](local_feat)
+            local_feat = paddle.reshape(
+                local_feat, shape=[local_feat.shape[0], -1])
+            feat_list.append(local_feat)
+
+            local_logits = self.pyramid_fc_list0[used_branches](
+                self.dropout_layer(local_feat))
+            logits_list.append(local_logits)
+
+            used_branches += 1
+
+        return feat_list, logits_list
+
+    def forward(self, x):
+        feat = self.base(x)
+        assert feat.shape[2] % self.num_stripes == 0
+        feat_list, logits_list = self.pyramid_forward(feat)
+        feat_out = paddle.concat(feat_list, axis=-1)
+        return feat_out
diff --git a/ppdet/modeling/reid/resnet.py b/ppdet/modeling/reid/resnet.py
new file mode 100644
index 000000000..02d70f737
--- /dev/null
+++ b/ppdet/modeling/reid/resnet.py
@@ -0,0 +1,320 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
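+
+# Shape sanity check for this backbone (illustrative comment only, not part
+# of any code path; the 384x128 input below is an assumed reid crop size):
+#
+#     model = ResNet50(pretrained=None, last_conv_stride=1)
+#     feat = model(paddle.rand([8, 3, 384, 128]))
+#     # conv1, the max-pool, and the two middle stages each halve the
+#     # resolution (stride 16 overall), while the last stage keeps stride 1
+#     # here, so feat has shape [8, 2048, 24, 8]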
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Normal + +__all__ = ["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + dilation=1, + groups=1, + act=None, + lr_mult=1.0, + name=None, + data_format="NCHW"): + super(ConvBNLayer, self).__init__() + conv_stdv = filter_size * filter_size * num_filters + self._conv = nn.Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + dilation=dilation, + groups=groups, + weight_attr=ParamAttr( + name=name + "_weights", + learning_rate=lr_mult, + initializer=Normal(0, math.sqrt(2. / conv_stdv))), + bias_attr=False, + data_format=data_format) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + "_scale"), + bias_attr=ParamAttr(bn_name + "_offset"), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance", + data_layout=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None, + lr_mult=1.0, + dilation=1, + data_format="NCHW"): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + dilation=dilation, + act="relu", + lr_mult=lr_mult, + name=name + "_branch2a", + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + dilation=dilation, + stride=stride, + act="relu", + lr_mult=lr_mult, + name=name + "_branch2b", + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + dilation=dilation, + act=None, + lr_mult=lr_mult, + name=name + "_branch2c", + data_format=data_format) + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + dilation=dilation, + stride=stride, + lr_mult=lr_mult, + name=name + "_branch1", + data_format=data_format) + self.shortcut = shortcut + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None, + data_format="NCHW"): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + name=name + "_branch2a", + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + name=name + "_branch2b", + data_format=data_format) + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + 
filter_size=1,
+                stride=stride,
+                name=name + "_branch1",
+                data_format=data_format)
+        self.shortcut = shortcut
+
+    def forward(self, inputs):
+        y = self.conv0(inputs)
+        conv1 = self.conv1(y)
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+        y = paddle.add(x=short, y=conv1)
+        y = F.relu(y)
+        return y
+
+
+class ResNet(nn.Layer):
+    def __init__(self,
+                 layers=50,
+                 lr_mult=1.0,
+                 last_conv_stride=2,
+                 last_conv_dilation=1):
+        super(ResNet, self).__init__()
+        self.layers = layers
+        self.data_format = "NCHW"
+        self.input_image_channel = 3
+        supported_layers = [18, 34, 50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(
+                supported_layers, layers)
+        if layers == 18:
+            depth = [2, 2, 2, 2]
+        elif layers == 34 or layers == 50:
+            depth = [3, 4, 6, 3]
+        elif layers == 101:
+            depth = [3, 4, 23, 3]
+        elif layers == 152:
+            depth = [3, 8, 36, 3]
+        num_channels = [64, 256, 512,
+                        1024] if layers >= 50 else [64, 64, 128, 256]
+        num_filters = [64, 128, 256, 512]
+        self.conv = ConvBNLayer(
+            num_channels=self.input_image_channel,
+            num_filters=64,
+            filter_size=7,
+            stride=2,
+            act="relu",
+            lr_mult=lr_mult,
+            name="conv1",
+            data_format=self.data_format)
+        self.pool2d_max = nn.MaxPool2D(
+            kernel_size=3, stride=2, padding=1, data_format=self.data_format)
+        self.block_list = []
+        if layers >= 50:
+            for block in range(len(depth)):
+                shortcut = False
+                for i in range(depth[block]):
+                    if layers in [101, 152] and block == 2:
+                        if i == 0:
+                            conv_name = "res" + str(block + 2) + "a"
+                        else:
+                            conv_name = "res" + str(block + 2) + "b" + str(i)
+                    else:
+                        conv_name = "res" + str(block + 2) + chr(97 + i)
+                    # the first unit of each stage after stage 0 downsamples;
+                    # the last stage uses last_conv_stride so reid models can
+                    # keep more spatial resolution
+                    if i != 0 or block == 0:
+                        stride = 1
+                    elif block == len(depth) - 1:
+                        stride = last_conv_stride
+                    else:
+                        stride = 2
+                    bottleneck_block = self.add_sublayer(
+                        conv_name,
+                        BottleneckBlock(
+                            num_channels=num_channels[block]
+                            if i == 0 else num_filters[block] * 4,
+                            num_filters=num_filters[block],
+                            stride=stride,
+                            shortcut=shortcut,
+                            name=conv_name,
+                            lr_mult=lr_mult,
+                            dilation=last_conv_dilation
+                            if block == len(depth) - 1 else 1,
+                            data_format=self.data_format))
+                    self.block_list.append(bottleneck_block)
+                    shortcut = True
+        else:
+            for block in range(len(depth)):
+                shortcut = False
+                for i in range(depth[block]):
+                    conv_name = "res" + str(block + 2) + chr(97 + i)
+                    basic_block = self.add_sublayer(
+                        conv_name,
+                        BasicBlock(
+                            num_channels=num_channels[block]
+                            if i == 0 else num_filters[block],
+                            num_filters=num_filters[block],
+                            stride=2 if i == 0 and block != 0 else 1,
+                            shortcut=shortcut,
+                            name=conv_name,
+                            data_format=self.data_format))
+                    self.block_list.append(basic_block)
+                    shortcut = True
+
+    def forward(self, inputs):
+        y = self.conv(inputs)
+        y = self.pool2d_max(y)
+        for block in self.block_list:
+            y = block(y)
+        return y
+
+
+def ResNet18(**args):
+    model = ResNet(layers=18, **args)
+    return model
+
+
+def ResNet34(**args):
+    model = ResNet(layers=34, **args)
+    return model
+
+
+def ResNet50(pretrained=None, **args):
+    model = ResNet(layers=50, **args)
+    if pretrained is not None:
+        if not (os.path.isdir(pretrained) or
+                os.path.exists(pretrained + '.pdparams')):
+            raise ValueError("Model pretrain path {} does not "
+                             "exist.".format(pretrained))
+        param_state_dict = paddle.load(pretrained + '.pdparams')
+        model.set_dict(param_state_dict)
+    return model
+
+
+def ResNet101(pretrained=None, **args):
+    model = ResNet(layers=101, **args)
+    if pretrained is not None:
+        if not (os.path.isdir(pretrained) or
+                os.path.exists(pretrained + '.pdparams')):
+            raise ValueError("Model pretrain path {} does not "
+                             "exist.".format(pretrained))
+        param_state_dict = paddle.load(pretrained + '.pdparams')
+        model.set_dict(param_state_dict)
+    return model
+
+
+def ResNet152(**args):
+    model = ResNet(layers=152, **args)
+    return model
diff --git a/requirements.txt b/requirements.txt
index f0eb1a8ce..bf87484fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,5 @@ Cython
 pycocotools
 #xtcocotools==1.6 #only for crowdpose
 setuptools>=42.0.0
+#lap #for mot
+#motmetrics #for mot
\ No newline at end of file
-- 
GitLab
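A minimal sketch of how the new reid model composes at inference time. It is illustrative only: the direct constructor call and the 384x128 crop size are assumptions, since in practice these modules are built from YAML configs through ppdet's workspace registry and fed by the DeepSORT pipeline.

    import paddle
    from ppdet.modeling.reid.pyramidal_embedding import PCBPlusDropoutPyramid

    # Build the pyramidal embedding model directly (configs normally do this).
    reid = PCBPlusDropoutPyramid(num_classes=751)
    reid.eval()

    # Four detected person crops; 384x128 is an assumed, PCB-style crop size
    # whose feature height (24) divides evenly into the 6 stripes.
    crops = paddle.rand([4, 3, 384, 128])

    # forward() returns the concatenated per-stripe features: 21 branches
    # (6+5+4+3+2+1) of 128 channels each, i.e. shape [4, 2688].
    embs = reid(crops)

These embeddings are what the tracker side of this patch consumes for appearance matching (see deepsort_matching.py).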