[MOT] JDE and DeepSORT: model (#2782)

* jde and deepsort: model

[MOT] JDE and DeepSORT: model (#2782)
* jde and deepsort: model
0b4dd432 · George Ni · GitHub · 30dda1ea · 0b4dd432 · 0b4dd432
30 changed file
--- a/ppdet/modeling/__init__.py
+++ b/ppdet/modeling/__init__.py
-# OP docs may contains math formula which may cause
-# DeprecationWarning in string parsing
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
 import warnings
 warnings.filterwarnings(
    action='ignore', category=DeprecationWarning, module='ops')
@@ -13,6 +25,8 @@ from . import losses
 from . import architectures
 from . import post_process
 from . import layers
+from . import reid
+from . import mot

 from .ops import *
 from .backbones import *
@@ -23,3 +37,5 @@ from .losses import *
 from .architectures import *
 from .post_process import *
 from .layers import *
+from .reid import *
+from .mot import *
--- a/ppdet/modeling/architectures/__init__.py
+++ b/ppdet/modeling/architectures/__init__.py
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
 from . import meta_arch
 from . import faster_rcnn
 from . import mask_rcnn
@@ -17,6 +17,8 @@ from . import ttfnet
 from . import s2anet
 from . import keypoint_hrhrnet
 from . import keypoint_hrnet
+from . import jde
+from . import deepsort

 from .meta_arch import *
 from .faster_rcnn import *
@@ -30,3 +32,5 @@ from .ttfnet import *
 from .s2anet import *
 from .keypoint_hrhrnet import *
 from .keypoint_hrnet import *
+from .jde import *
+from .deepsort import *
--- a/ppdet/modeling/architectures/deepsort.py
+++ b/ppdet/modeling/architectures/deepsort.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+# 
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box
+
+__all__ = ['DeepSORT']
+
+
+@register
+class DeepSORT(BaseArch):
+    """
+    DeepSORT network, see https://arxiv.org/abs/1703.07402
+
+    Chains an optional detector, a ReID feature extractor and a tracker:
+    boxes come either from the detector or from precomputed detections in
+    the inputs, every box is cropped from the original image and embedded
+    by the ReID model, and the resulting detections are fed to the tracker.
+
+    Args:
+        detector (object): detector model instance, or None when the
+            detections are supplied through the inputs
+        reid (object): reid model instance
+        tracker (object): tracker instance
+    """
+    __category__ = 'architecture'
+
+    def __init__(self,
+                 detector='YOLOv3',
+                 reid='PCBPlusDropoutPyramid',
+                 tracker='DeepSORTTracker'):
+        super(DeepSORT, self).__init__()
+        self.detector = detector
+        self.reid = reid
+        self.tracker = tracker
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """
+        Create the sub-modules named in ``cfg``.
+
+        Args:
+            cfg (dict): must contain the keys 'detector', 'reid' and
+                'tracker'. A 'detector' value of None or the string 'None'
+                disables the detector.
+
+        Returns:
+            dict: keyword arguments for ``__init__`` ('detector', 'reid',
+                'tracker').
+        """
+        # A YAML `detector: None` is loaded as the string 'None', whereas
+        # `detector: null` / `~` is loaded as Python None. Both mean that
+        # no detector should be built.
+        detector_cfg = cfg['detector']
+        if detector_cfg is None or detector_cfg == 'None':
+            detector = None
+        else:
+            detector = create(detector_cfg)
+        reid = create(cfg['reid'])
+        tracker = create(cfg['tracker'])
+
+        return {
+            "detector": detector,
+            "reid": reid,
+            "tracker": tracker,
+        }
+
+    def _forward(self):
+        """
+        Run detection (or load detections), ReID and one tracker step.
+
+        Reads 'ori_image', 'image', 'im_shape' and 'scale_factor' from
+        ``self.inputs``, plus 'pred_bboxes' and 'pred_scores' when the
+        detector is not used.
+
+        Returns:
+            The value returned by ``self.tracker.update``.
+        """
+        assert 'ori_image' in self.inputs
+        # Precomputed detections in the inputs take priority over the
+        # detector.
+        load_dets = 'pred_bboxes' in self.inputs and 'pred_scores' in self.inputs
+
+        ori_image = self.inputs['ori_image']
+        input_shape = self.inputs['image'].shape[2:]
+        im_shape = self.inputs['im_shape']
+        scale_factor = self.inputs['scale_factor']
+
+        if self.detector and not load_dets:
+            outs = self.detector(self.inputs)
+            if outs['bbox_num'] > 0:
+                # Columns 2: of 'bbox' are used as box coordinates and
+                # column 1 as the score.
+                pred_bboxes = scale_coords(outs['bbox'][:, 2:], input_shape,
+                                           im_shape, scale_factor)
+                pred_scores = outs['bbox'][:, 1:2]
+            else:
+                pred_bboxes = []
+                pred_scores = []
+        else:
+            # Raises KeyError when there is neither a detector nor
+            # precomputed detections.
+            pred_bboxes = self.inputs['pred_bboxes']
+            pred_scores = self.inputs['pred_scores']
+
+        if len(pred_bboxes) > 0:
+            pred_bboxes = clip_box(pred_bboxes, input_shape, im_shape,
+                                   scale_factor)
+            # (x1, y1, x2, y2) -> (x1, y1, x2 - x1 + 1, y2 - y1 + 1)
+            bbox_tlwh = paddle.concat(
+                (pred_bboxes[:, 0:2],
+                 pred_bboxes[:, 2:4] - pred_bboxes[:, 0:2] + 1),
+                axis=1)
+
+            crops, pred_scores = get_crops(
+                pred_bboxes, ori_image, pred_scores, w=64, h=192)
+
+            if len(crops) > 0:
+                features = self.reid(paddle.to_tensor(crops))
+                # NOTE(review): bbox_tlwh is indexed by position in the
+                # scores returned by get_crops; if get_crops can drop boxes
+                # the indices would no longer line up - TODO confirm.
+                detections = [
+                    Detection(bbox_tlwh[i], conf, features[i])
+                    for i, conf in enumerate(pred_scores)
+                ]
+            else:
+                detections = []
+        else:
+            detections = []
+
+        # The tracker is advanced even when there are no detections.
+        self.tracker.predict()
+        online_targets = self.tracker.update(detections)
+
+        return online_targets
+
+    def get_pred(self):
+        """Return the tracking result of ``_forward``."""
+        return self._forward()
--- a/ppdet/modeling/architectures/jde.py
+++ b/ppdet/modeling/architectures/jde.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+# 
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from ppdet.modeling.mot.utils import scale_coords
+from ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['JDE']
+
+
+@register
+class JDE(BaseArch):
+    """
+    Joint Detection and Embedding (JDE) network, see
+    https://arxiv.org/abs/1909.12605v1
+
+    Args:
+        detector (object): detector model instance
+        reid (object): reid model instance
+        tracker (object): tracker instance
+        test_mode (str): what inference returns, one of 'detection',
+            'embedding' or 'tracking'
+    """
+    __category__ = 'architecture'
+
+    def __init__(self,
+                 detector='YOLOv3',
+                 reid='JDEEmbeddingHead',
+                 tracker='JDETracker',
+                 test_mode='detection'):
+        super(JDE, self).__init__()
+        self.detector = detector
+        self.reid = reid
+        self.tracker = tracker
+        self.test_mode = test_mode
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """
+        Create detector, reid head and tracker from ``cfg``; the reid head
+        is given the detector neck's ``out_shape`` as ``input_shape``.
+        """
+        detector = create(cfg['detector'])
+        reid = create(cfg['reid'], input_shape=detector.neck.out_shape)
+        tracker = create(cfg['tracker'])
+        return {"detector": detector, "reid": reid, "tracker": tracker}
+
+    def _forward(self):
+        """
+        Run the detector, then branch on training / ``test_mode``.
+
+        Returns:
+            training: the value returned by the reid head (the JDE losses).
+            'detection': dict with 'bbox' and 'bbox_num'.
+            'embedding': the value returned by the reid head called with
+                ``test_emb=True``.
+            'tracking': the value returned by ``self.tracker.update``.
+
+        Raises:
+            ValueError: if not training and ``test_mode`` is not one of
+                the three modes above.
+        """
+        det_outs = self.detector(self.inputs)
+
+        if self.training:
+            emb_feats = det_outs['emb_feats']
+            det_losses = det_outs['det_losses']
+            loss_confs = det_losses['loss_confs']
+            loss_boxes = det_losses['loss_boxes']
+            return self.reid(emb_feats, self.inputs, loss_confs, loss_boxes)
+
+        if self.test_mode == 'detection':
+            return {'bbox': det_outs['bbox'], 'bbox_num': det_outs['bbox_num']}
+
+        if self.test_mode == 'embedding':
+            return self.reid(det_outs['emb_feats'], self.inputs, test_emb=True)
+
+        if self.test_mode != 'tracking':
+            raise ValueError("Unknown test_mode {}.".format(self.test_mode))
+
+        # Tracking: embeddings first, then rescale the boxes in place.
+        emb_outs = self.reid(det_outs['emb_feats'], self.inputs)
+        boxes_idx = det_outs['boxes_idx']
+        bbox = det_outs['bbox']
+
+        input_shape = self.inputs['image'].shape[2:]
+        im_shape = self.inputs['im_shape']
+        scale_factor = self.inputs['scale_factor']
+        bbox[:, 2:] = scale_coords(bbox[:, 2:], input_shape, im_shape,
+                                   scale_factor)
+
+        nms_keep_idx = det_outs['nms_keep_idx']
+
+        # Detections are laid out as box coordinates followed by the score.
+        coords, scores = bbox[:, 2:], bbox[:, 1:2]
+        pred_dets = paddle.concat((coords, scores), axis=1)
+
+        # Keep the embeddings of thresholded boxes, then of the NMS
+        # survivors.
+        pred_embs = paddle.gather_nd(
+            paddle.gather_nd(emb_outs, boxes_idx), nms_keep_idx)
+
+        return self.tracker.update(pred_dets, pred_embs)
+
+    def get_loss(self):
+        """Training entry point; delegates to ``_forward``."""
+        return self._forward()
+
+    def get_pred(self):
+        """Inference entry point; delegates to ``_forward``."""
+        return self._forward()
--- a/ppdet/modeling/architectures/yolo.py
+++ b/ppdet/modeling/architectures/yolo.py
@@ -19,7 +19,8 @@ class YOLOv3(BaseArch):
                 neck='YOLOv3FPN',
                 yolo_head='YOLOv3Head',
                 post_process='BBoxPostProcess',
-                 data_format='NCHW'):
+                 data_format='NCHW',
+                 for_mot=False):
        """
        YOLOv3 network, see https://arxiv.org/abs/1804.02767

@@ -29,12 +30,14 @@ class YOLOv3(BaseArch):
            yolo_head (nn.Layer): anchor_head instance
            bbox_post_process (object): `BBoxPostProcess` instance
            data_format (str): data format, NCHW or NHWC
+            for_mot (bool): whether to return the additional features used by the tracking model
        """
        super(YOLOv3, self).__init__(data_format=data_format)
        self.backbone = backbone
        self.neck = neck
        self.yolo_head = yolo_head
        self.post_process = post_process
+        self.for_mot = for_mot

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
@@ -57,21 +60,44 @@ class YOLOv3(BaseArch):

    def _forward(self):
+        """
+        Run backbone, neck and head.
+
+        In training mode returns the head losses, wrapped in a dict
+        together with 'emb_feats' when ``for_mot`` is set. Otherwise
+        returns a dict with 'bbox' and 'bbox_num', plus 'boxes_idx',
+        'nms_keep_idx' and 'emb_feats' when ``for_mot`` is set.
+        """
        body_feats = self.backbone(self.inputs)
-        body_feats = self.neck(body_feats)
+        neck_feats = self.neck(body_feats, self.for_mot)
+
+        # A dict result from the neck carries both the head inputs and the
+        # embedding features.
+        # NOTE(review): emb_feats is only bound in this branch; with
+        # for_mot set and a non-dict neck result the uses below would fail.
+        if isinstance(neck_feats, dict):
+            assert self.for_mot == True
+            emb_feats = neck_feats['emb_feats']
+            neck_feats = neck_feats['yolo_feats']

        if self.training:
-            return self.yolo_head(body_feats, self.inputs)
+            yolo_losses = self.yolo_head(neck_feats, self.inputs)
+
+            if self.for_mot:
+                return {'det_losses': yolo_losses, 'emb_feats': emb_feats}
+            else:
+                return yolo_losses
+
        else:
-            yolo_head_outs = self.yolo_head(body_feats)
-            bbox, bbox_num = self.post_process(
-                yolo_head_outs, self.yolo_head.mask_anchors,
-                self.inputs['im_shape'], self.inputs['scale_factor'])
-            return bbox, bbox_num
+            yolo_head_outs = self.yolo_head(neck_feats)
+
+            if self.for_mot:
+                # In this mode post_process is called without im_shape /
+                # scale_factor and returns four values.
+                boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process(
+                    yolo_head_outs, self.yolo_head.mask_anchors)
+                output = {
+                    'bbox': bbox,
+                    'bbox_num': bbox_num,
+                    'boxes_idx': boxes_idx,
+                    'nms_keep_idx': nms_keep_idx,
+                    'emb_feats': emb_feats,
+                }
+            else:
+                bbox, bbox_num = self.post_process(
+                    yolo_head_outs, self.yolo_head.mask_anchors,
+                    self.inputs['im_shape'], self.inputs['scale_factor'])
+                output = {'bbox': bbox, 'bbox_num': bbox_num}
+
+            return output

    def get_loss(self):
+        # Training entry point; simply delegates to _forward.
        return self._forward()

    def get_pred(self):
+        # Inference entry point; _forward already returns the output dict.
-        bbox_pred, bbox_num = self._forward()
-        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
-        return output
+        return self._forward()
--- a/ppdet/modeling/backbones/darknet.py
+++ b/ppdet/modeling/backbones/darknet.py
@@ -35,6 +35,7 @@ class ConvBNLayer(nn.Layer):
                 norm_type='bn',
                 norm_decay=0.,
                 act="leaky",
+                 freeze_norm=False,
                 data_format='NCHW',
                 name=''):
        """
@@ -50,6 +51,7 @@ class ConvBNLayer(nn.Layer):
            norm_type (str): batch norm type, default bn
            norm_decay (str): decay for weight and bias of batch norm layer, default 0.
            act (str): activation function type, default 'leaky', which means leaky_relu
+            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
        """
        super(ConvBNLayer, self).__init__()
@@ -67,6 +69,7 @@ class ConvBNLayer(nn.Layer):
            ch_out,
            norm_type=norm_type,
            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
            data_format=data_format)
        self.act = act

@@ -89,6 +92,7 @@ class DownSample(nn.Layer):
                 padding=1,
                 norm_type='bn',
                 norm_decay=0.,
+                 freeze_norm=False,
                 data_format='NCHW'):
        """
        downsample layer
@@ -101,6 +105,7 @@ class DownSample(nn.Layer):
            padding (int): padding size, default 1
            norm_type (str): batch norm type, default bn
            norm_decay (str): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
        """

@@ -114,6 +119,7 @@ class DownSample(nn.Layer):
            padding=padding,
            norm_type=norm_type,
            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
            data_format=data_format)
        self.ch_out = ch_out

@@ -128,6 +134,7 @@ class BasicBlock(nn.Layer):
                 ch_out,
                 norm_type='bn',
                 norm_decay=0.,
+                 freeze_norm=False,
                 data_format='NCHW'):
        """
        BasicBlock layer of DarkNet
@@ -137,6 +144,7 @@ class BasicBlock(nn.Layer):
            ch_out (int): output channel
            norm_type (str): batch norm type, default bn
            norm_decay (str): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
        """

@@ -150,6 +158,7 @@ class BasicBlock(nn.Layer):
            padding=0,
            norm_type=norm_type,
            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
            data_format=data_format)
        self.conv2 = ConvBNLayer(
            ch_in=ch_out,
@@ -159,6 +168,7 @@ class BasicBlock(nn.Layer):
            padding=1,
            norm_type=norm_type,
            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
            data_format=data_format)

    def forward(self, inputs):
@@ -175,6 +185,7 @@ class Blocks(nn.Layer):
                 count,
                 norm_type='bn',
                 norm_decay=0.,
+                 freeze_norm=False,
                 name=None,
                 data_format='NCHW'):
        """
@@ -186,6 +197,7 @@ class Blocks(nn.Layer):
            count (int): number of BasicBlock layer
            norm_type (str): batch norm type, default bn
            norm_decay (str): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
            name (str): layer name
            data_format (str): data format, NCHW or NHWC
        """
@@ -196,6 +208,7 @@ class Blocks(nn.Layer):
            ch_out,
            norm_type=norm_type,
            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
            data_format=data_format)
        self.res_out_list = []
        for i in range(1, count):
@@ -207,6 +220,7 @@ class Blocks(nn.Layer):
                    ch_out,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
                    data_format=data_format))
            self.res_out_list.append(res_out)
        self.ch_out = ch_out
@@ -233,6 +247,7 @@ class DarkNet(nn.Layer):
                 num_stages=5,
                 norm_type='bn',
                 norm_decay=0.,
+                 freeze_norm=False,
                 data_format='NCHW'):
        """
        Darknet, see https://pjreddie.com/darknet/yolo/
@@ -261,6 +276,7 @@ class DarkNet(nn.Layer):
            padding=1,
            norm_type=norm_type,
            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
            data_format=data_format)

        self.downsample0 = DownSample(
@@ -268,6 +284,7 @@ class DarkNet(nn.Layer):
            ch_out=32 * 2,
            norm_type=norm_type,
            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
            data_format=data_format)

        self._out_channels = []
@@ -284,6 +301,7 @@ class DarkNet(nn.Layer):
                    stage,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
                    data_format=data_format,
                    name=name))
            self.darknet_conv_block_list.append(conv_block)
@@ -298,6 +316,7 @@ class DarkNet(nn.Layer):
                    ch_out=32 * (2**(i + 2)),
                    norm_type=norm_type,
                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
                    data_format=data_format))
            self.downsample_list.append(downsample)


--- a/ppdet/modeling/layers.py
+++ b/ppdet/modeling/layers.py
@@ -836,6 +836,111 @@ class TTFBox(object):
        return results, paddle.shape(results)[0:1]


+@register
+@serializable
+class JDEBox(object):
+    """
+    Decode raw JDE head outputs into boxes and foreground scores.
+
+    Args:
+        num_classes (int): number of classes; every anchor is reshaped to
+            ``num_classes + 5`` channels
+        conf_thresh (float): only predictions whose score is above this
+            value are kept
+        downsample_ratio (int): stride of the first head output; output
+            ``i`` uses ``downsample_ratio // 2**i``
+    """
+    __shared__ = ['num_classes']
+
+    def __init__(self, num_classes=1, conf_thresh=0.3, downsample_ratio=32):
+        self.num_classes = num_classes
+        self.conf_thresh = conf_thresh
+        self.downsample_ratio = downsample_ratio
+
+    def generate_anchor(self, nGh, nGw, anchor_wh):
+        """
+        Build, for every anchor and grid cell, the vector (x, y, w, h).
+
+        Args:
+            nGh: grid height
+            nGw: grid width
+            anchor_wh: anchor sizes; used with NumPy-style indexing,
+                ``repeat`` and ``astype`` - presumably an ndarray of shape
+                (nA, 2), TODO confirm
+
+        Returns:
+            Tensor: anchor mesh with the 4 values on the last axis.
+        """
+        nA = len(anchor_wh)
+        yv, xv = paddle.meshgrid([paddle.arange(nGh), paddle.arange(nGw)])
+        mesh = paddle.stack(
+            (xv, yv), axis=0).cast(dtype='float32')  # 2 x nGh x nGw
+        # Replicate the cell coordinates once per anchor.
+        meshs = paddle.tile(mesh, [nA, 1, 1, 1])
+
+        # Broadcast every anchor's (w, h) over the whole grid.
+        anchor_offset_mesh = anchor_wh[:, :, None][:, :, :, None].repeat(
+            int(nGh), axis=-2).repeat(
+                int(nGw), axis=-1)
+        anchor_offset_mesh = paddle.to_tensor(
+            anchor_offset_mesh.astype(np.float32))
+        # nA x 2 x nGh x nGw
+
+        anchor_mesh = paddle.concat([meshs, anchor_offset_mesh], axis=1)
+        anchor_mesh = paddle.transpose(anchor_mesh,
+                                       [0, 2, 3, 1])  # (nA x nGh x nGw) x 4
+        return anchor_mesh
+
+    def decode_delta(self, delta, fg_anchor_list):
+        """
+        Apply (dx, dy, dw, dh) deltas to (x, y, w, h) anchors.
+
+        Both arguments are indexed as ``[:, 0]`` .. ``[:, 3]``.
+
+        Returns:
+            Tensor: corner boxes (x1, y1, x2, y2) stacked along axis 1.
+        """
+        px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \
+                        fg_anchor_list[:, 2], fg_anchor_list[:,3]
+        dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3]
+        # Centre is shifted proportionally to the anchor size; width and
+        # height are scaled exponentially.
+        gx = pw * dx + px
+        gy = ph * dy + py
+        gw = pw * paddle.exp(dw)
+        gh = ph * paddle.exp(dh)
+        # Centre/size -> corner coordinates.
+        gx1 = gx - gw * 0.5
+        gy1 = gy - gh * 0.5
+        gx2 = gx + gw * 0.5
+        gy2 = gy + gh * 0.5
+        return paddle.stack([gx1, gy1, gx2, gy2], axis=1)
+
+    def decode_delta_map(self, delta_map, anchors):
+        """
+        Decode a 5-D delta map (nB, nA, nGh, nGw, 4) against ``anchors``.
+
+        Returns:
+            Tensor: decoded boxes reshaped to [nB, -1, 4].
+        """
+        delta_map_shape = paddle.shape(delta_map)
+        delta_map_shape.stop_gradient = True
+        nB, nA, nGh, nGw, _ = delta_map_shape[:]
+        anchor_mesh = self.generate_anchor(nGh, nGw, anchors)
+        # only support bs=1
+        anchor_mesh = paddle.unsqueeze(anchor_mesh, 0)
+
+        # Deltas and anchors are both flattened to rows of 4 values.
+        pred_list = self.decode_delta(
+            paddle.reshape(
+                delta_map, shape=[-1, 4]),
+            paddle.reshape(
+                anchor_mesh, shape=[-1, 4]))
+        pred_map = paddle.reshape(pred_list, shape=[nB, -1, 4])
+        return pred_map
+
+    def __call__(self, yolo_head_out, anchors):
+        """
+        Decode every head output and keep predictions above
+        ``conf_thresh``.
+
+        Args:
+            yolo_head_out: iterable of head outputs, one per level
+            anchors: per-level anchor settings, each a flat sequence whose
+                even positions are read as widths and odd positions as
+                heights
+
+        Returns:
+            tuple: (boxes_idx, yolo_boxes_out, yolo_scores_out). When no
+                prediction passes the threshold, placeholder tensors with a
+                single zero index / box / score are returned instead.
+        """
+        bbox_pred_list = []
+        for i, head_out in enumerate(yolo_head_out):
+            # The stride halves with every subsequent head output.
+            stride = self.downsample_ratio // 2**i
+            anc_w, anc_h = anchors[i][0::2], anchors[i][1::2]
+            # Anchor sizes expressed in grid cells.
+            anchor_vec = np.stack((anc_w, anc_h), axis=1) / stride
+            nA = len(anc_w)
+            boxes_shape = paddle.shape(head_out)
+            boxes_shape.stop_gradient = True
+            nB, nGh, nGw = boxes_shape[0], boxes_shape[-2], boxes_shape[-1]
+
+            p = head_out.reshape((nB, nA, self.num_classes + 5, nGh, nGw))
+            p = paddle.transpose(p, perm=[0, 1, 3, 4, 2])  # [nB, 4, 19, 34, 6]
+            p_box = p[:, :, :, :, :4]  # [nB, 4, 19, 34, 4]
+            boxes = self.decode_delta_map(p_box, anchor_vec)  # [nB, 4*19*34, 4]
+            # Back from grid cells to the stride-scaled coordinates.
+            boxes = boxes * stride
+
+            p_conf = paddle.transpose(
+                p[:, :, :, :, 4:6], perm=[0, 4, 1, 2, 3])  # [nB, 2, 4, 19, 34]
+            # Softmax over the two confidence channels; index 1 is used as
+            # the score.
+            p_conf = F.softmax(
+                p_conf,
+                axis=1)[:, 1, :, :, :].unsqueeze(-1)  # [nB, 4, 19, 34, 1]
+            scores = paddle.reshape(p_conf, shape=[nB, -1, 1])
+
+            bbox_pred_list.append(paddle.concat([boxes, scores], axis=-1))
+
+        yolo_boxes_pred = paddle.concat(bbox_pred_list, axis=1)
+        boxes_idx = paddle.nonzero(yolo_boxes_pred[:, :, -1] > self.conf_thresh)
+        boxes_idx.stop_gradient = True
+        # Nothing above the threshold: return single-element placeholders.
+        if boxes_idx.shape[0] == 0:  # TODO: deploy
+            boxes_idx = paddle.to_tensor(np.array([[0]], dtype='int64'))
+            yolo_boxes_out = paddle.to_tensor(
+                np.array(
+                    [[[0.0, 0.0, 0.0, 0.0]]], dtype='float32'))
+            yolo_scores_out = paddle.to_tensor(
+                np.array(
+                    [[[0.0]]], dtype='float32'))
+            return boxes_idx, yolo_boxes_out, yolo_scores_out
+
+        yolo_boxes = paddle.gather_nd(yolo_boxes_pred, boxes_idx)
+        # nB here is the value left over from the last loop iteration.
+        yolo_boxes_out = paddle.reshape(yolo_boxes[:, :4], shape=[nB, -1, 4])
+        yolo_scores_out = paddle.reshape(yolo_boxes[:, 4:5], shape=[nB, 1, -1])
+        # Drop the batch column of the indices.
+        boxes_idx = boxes_idx[:, 1:]
+        return boxes_idx, yolo_boxes_out, yolo_scores_out  # [163], [1, 163, 4], [1, 1, 163]
+
+
 @register
 @serializable
 class MaskMatrixNMS(object):

--- a/ppdet/modeling/losses/__init__.py
+++ b/ppdet/modeling/losses/__init__.py
@@ -20,6 +20,7 @@ from . import fcos_loss
 from . import solov2_loss
 from . import ctfocal_loss
 from . import keypoint_loss
+from . import jde_loss

 from .yolo_loss import *
 from .iou_aware_loss import *
@@ -29,3 +30,4 @@ from .fcos_loss import *
 from .solov2_loss import *
 from .ctfocal_loss import *
 from .keypoint_loss import *
+from .jde_loss import *
--- a/ppdet/modeling/losses/jde_loss.py
+++ b/ppdet/modeling/losses/jde_loss.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register
+
+__all__ = ['JDEDetectionLoss', 'JDEEmbeddingLoss', 'JDELoss']
+
+
+@register
+class JDEDetectionLoss(nn.Layer):
+    """
+    Detection part of the JDE loss: a confidence cross-entropy loss and a
+    smooth-L1 box loss for every head output.
+
+    Args:
+        num_classes (int): number of classes; every anchor is reshaped to
+            ``num_classes + 5`` channels
+    """
+    __shared__ = ['num_classes']
+
+    def __init__(self, num_classes=1):
+        super(JDEDetectionLoss, self).__init__()
+        self.num_classes = num_classes
+
+    def det_loss(self, p_det, anchor, t_conf, t_box):
+        """
+        Compute the confidence and box loss of one head output.
+
+        Args:
+            p_det (Tensor): head output, reshaped here to
+                [nB, nA, num_classes + 5, nGh, nGw]
+            anchor: anchors of this level; only ``len(anchor)`` is used
+            t_conf (Tensor): confidence targets; -1 entries are ignored by
+                the cross entropy and entries > 0 select boxes for the
+                regression loss
+            t_box (Tensor): box targets, flattened to rows of 4 values
+
+        Returns:
+            tuple: (loss_conf, loss_box)
+        """
+        pshape = paddle.shape(p_det)
+        pshape.stop_gradient = True
+        nB, nGh, nGw = pshape[0], pshape[-2], pshape[-1]
+        nA = len(anchor)
+        # Move the per-anchor channels to the last axis.
+        p_det = paddle.reshape(
+            p_det, [nB, nA, self.num_classes + 5, nGh, nGw]).transpose(
+                (0, 1, 3, 4, 2))
+
+        # 1. loss_conf: cross_entropy
+        p_conf = p_det[:, :, :, :, 4:6]
+        p_conf_flatten = paddle.reshape(p_conf, [-1, 2])
+        t_conf_flatten = t_conf.flatten()
+        t_conf_flatten = paddle.cast(t_conf_flatten, dtype="int64")
+        t_conf_flatten.stop_gradient = True
+        loss_conf = F.cross_entropy(
+            p_conf_flatten, t_conf_flatten, ignore_index=-1, reduction='mean')
+        loss_conf.stop_gradient = False
+
+        # 2. loss_box: smooth_l1_loss
+        p_box = p_det[:, :, :, :, :4]
+        p_box_flatten = paddle.reshape(p_box, [-1, 4])
+        t_box_flatten = paddle.reshape(t_box, [-1, 4])
+        # Only positions with a positive confidence target are regressed.
+        fg_inds = paddle.nonzero(t_conf_flatten > 0).flatten()
+        if fg_inds.numel() > 0:
+            reg_delta = paddle.gather(p_box_flatten, fg_inds)
+            reg_target = paddle.gather(t_box_flatten, fg_inds)
+        else:
+            # No positive target: use all-zero placeholders so the box loss
+            # evaluates to zero.
+            reg_delta = paddle.to_tensor([0, 0, 0, 0], dtype='float32')
+            reg_delta.stop_gradient = False
+            reg_target = paddle.to_tensor([0, 0, 0, 0], dtype='float32')
+        reg_target.stop_gradient = True
+        loss_box = F.smooth_l1_loss(
+            reg_delta, reg_target, reduction='mean', delta=1.0)
+        loss_box.stop_gradient = False
+
+        return loss_conf, loss_box
+
+    def forward(self, det_outs, targets, anchors):
+        """
+        Args:
+            det_outs (list[Tensor]): output from detection head, each one
+                is a 4-D Tensor with shape [N, C, H, W].
+            targets (dict): contains 'im_id', 'gt_bbox', 'gt_ide', 'image',
+                'im_shape', 'scale_factor' and 'tbox', 'tconf', 'tide' of
+                each FPN level.
+            anchors (list[list]): anchor setting of JDE model, N row M col, N is
+                the anchor levels(FPN levels), M is the anchor scales each
+                level.
+
+        Returns:
+            dict: 'loss_confs' and 'loss_boxes', each a list with one loss
+                per level.
+        """
+        assert len(det_outs) == len(anchors)
+        loss_confs = []
+        loss_boxes = []
+        for i, (p_det, anchor) in enumerate(zip(det_outs, anchors)):
+            # Targets are stored per level under 'tconf<i>' / 'tbox<i>'.
+            t_conf = targets['tconf{}'.format(i)]
+            t_box = targets['tbox{}'.format(i)]
+
+            loss_conf, loss_box = self.det_loss(p_det, anchor, t_conf, t_box)
+            loss_confs.append(loss_conf)
+            loss_boxes.append(loss_box)
+        return {'loss_confs': loss_confs, 'loss_boxes': loss_boxes}
+
+
+@register
+class JDEEmbeddingLoss(nn.Layer):
+    """
+    Identity (embedding) part of the JDE loss: a cross-entropy loss over
+    classifier logits of the embeddings at positive positions.
+    """
+
+    def __init__(self, ):
+        super(JDEEmbeddingLoss, self).__init__()
+        # One-element parameter that is only used to build a zero loss when
+        # there is nothing to train on (see emb_loss).
+        self.phony = self.create_parameter(shape=[1], dtype="float32")
+
+    def emb_loss(self, p_ide, t_conf, t_ide, emb_scale, classifier):
+        """
+        Compute the identity loss of one level.
+
+        Args:
+            p_ide (Tensor): embedding map; axis 1 is taken as the embedding
+                dimension
+            t_conf (Tensor): confidence targets; entries > 0 mark positions
+                that contribute
+            t_ide (Tensor): identity targets; -1 is ignored
+            emb_scale: factor applied to the normalized embeddings
+            classifier (callable): maps embeddings to identity logits
+
+        Returns:
+            Tensor: the identity loss.
+        """
+        emb_dim = p_ide.shape[1]
+        # Channels last, then one embedding per row.
+        p_ide = p_ide.transpose((0, 2, 3, 1))
+        p_ide_flatten = paddle.reshape(p_ide, [-1, emb_dim])
+        mask = t_conf > 0
+        mask = paddle.cast(mask, dtype="int64")
+        mask.stop_gradient = True
+        # Reduce over axis 1 so a position counts if any entry there is
+        # positive.
+        emb_mask = mask.max(1).flatten()
+        emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten()
+        emb_mask_inds.stop_gradient = True
+        # use max(1) to decide the id, TODO: more reasonable strategy
+        t_ide_flatten = t_ide.max(1).flatten()
+        t_ide_flatten = paddle.cast(t_ide_flatten, dtype="int64")
+        valid_inds = paddle.nonzero(t_ide_flatten != -1).flatten()
+
+        if emb_mask_inds.numel() == 0 or valid_inds.numel() == 0:
+            # loss_ide = paddle.to_tensor([0]) # will be error in gradient backward
+            loss_ide = self.phony * 0  # todo
+        else:
+            embedding = paddle.gather(p_ide_flatten, emb_mask_inds)
+            embedding = emb_scale * F.normalize(embedding)
+            logits = classifier(embedding)
+
+            ide_target = paddle.gather(t_ide_flatten, emb_mask_inds)
+
+            loss_ide = F.cross_entropy(
+                logits, ide_target, ignore_index=-1, reduction='mean')
+        loss_ide.stop_gradient = False
+
+        return loss_ide
+
+    def forward(self, ide_outs, targets, emb_scale, classifier):
+        """
+        Compute the identity loss of every level.
+
+        Args:
+            ide_outs: iterable of embedding maps, one per level
+            targets (dict): holds 'tconf<i>' and 'tide<i>' for every level
+            emb_scale: forwarded to ``emb_loss``
+            classifier (callable): forwarded to ``emb_loss``
+
+        Returns:
+            list: one identity loss per level.
+        """
+        loss_ides = []
+        for i, p_ide in enumerate(ide_outs):
+            t_conf = targets['tconf{}'.format(i)]
+            t_ide = targets['tide{}'.format(i)]
+
+            loss_ide = self.emb_loss(p_ide, t_conf, t_ide, emb_scale,
+                                     classifier)
+            loss_ides.append(loss_ide)
+        return loss_ides
+
+
+@register
+class JDELoss(nn.Layer):
+    """
+    Combine the per-level confidence, box and identity losses into the
+    final JDE loss.
+    """
+
+    def __init__(self):
+        super(JDELoss, self).__init__()
+
+    def forward(self, loss_confs, loss_boxes, loss_ides, loss_params_cls,
+                loss_params_reg, loss_params_ide, targets):
+        """
+        Args:
+            loss_confs (list): confidence loss of every level
+            loss_boxes (list): box loss of every level
+            loss_ides (list): identity loss of every level
+            loss_params_cls (list): callables applied to the confidence
+                losses, one per level
+            loss_params_reg (list): callables applied to the box losses
+            loss_params_ide (list): callables applied to the identity
+                losses
+            targets (dict): must contain 'gt_bbox'
+
+        Returns:
+            dict: 'loss_conf', 'loss_box', 'loss_ide' (plain sums over the
+                levels), 'loss' (sum of the transformed per-level losses)
+                and 'nTargets'.
+        """
+        assert len(loss_confs) == len(loss_boxes) == len(loss_ides)
+        assert len(loss_params_cls) == len(loss_params_reg) == len(
+            loss_params_ide)
+        assert len(loss_confs) == len(loss_params_cls)
+
+        # Average number of gt rows with a non-zero coordinate sum per
+        # image.
+        gt_bbox = targets['gt_bbox']
+        batchsize = gt_bbox.shape[0]
+        nonzero_rows = paddle.nonzero(paddle.sum(gt_bbox, axis=2))
+        nTargets = paddle.to_tensor(
+            nonzero_rows.shape[0] / batchsize, dtype='float32')
+        nTargets.stop_gradient = True
+
+        # Every level's three losses go through that level's own callables
+        # before being added up.
+        per_level = zip(loss_confs, loss_boxes, loss_ides, loss_params_cls,
+                        loss_params_reg, loss_params_ide)
+        jde_losses = [
+            conf_fn(conf) + box_fn(box) + ide_fn(ide)
+            for conf, box, ide, conf_fn, box_fn, ide_fn in per_level
+        ]
+
+        return {
+            "loss_conf": sum(loss_confs),
+            "loss_box": sum(loss_boxes),
+            "loss_ide": sum(loss_ides),
+            "loss": sum(jde_losses),
+            "nTargets": nTargets,
+        }
--- a/ppdet/modeling/mot/__init__.py
+++ b/ppdet/modeling/mot/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import matching
+from . import tracker
+from . import motion
+from . import visualization
+from . import utils
+
+from .matching import *
+from .tracker import *
+from .motion import *
+from .visualization import *
+from .utils import *
--- a/ppdet/modeling/mot/matching/__init__.py
+++ b/ppdet/modeling/mot/matching/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import jde_matching
+from . import deepsort_matching
+
+from .jde_matching import *
+from .deepsort_matching import *
--- a/ppdet/modeling/mot/matching/deepsort_matching.py
+++ b/ppdet/modeling/mot/matching/deepsort_matching.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is borrowed from https://github.com/nwojke/deep_sort/tree/master/deep_sort
+"""
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+from ..motion import kalman_filter
+
+# Cost written into entries that gating marks as infeasible; it is the
+# default value of `gated_cost` in `gate_cost_matrix`.
+INFTY_COST = 1e+5
+
+# Names exported by `from .deepsort_matching import *`.
+__all__ = [
+    'iou_1toN',
+    'iou_cost',
+    '_nn_euclidean_distance',
+    '_nn_cosine_distance',
+    'NearestNeighborDistanceMetric',
+    'min_cost_matching',
+    'matching_cascade',
+    'gate_cost_matrix',
+]
+
+
+def iou_1toN(bbox, candidates):
+    """
+    Compute the intersection over union (IoU) of one box against N candidates.
+
+    Args:
+        bbox (ndarray): A bounding box in format
+            `(top left x, top left y, width, height)`.
+        candidates (ndarray): A matrix of candidate bounding boxes (one per
+            row) in the same format as `bbox`.
+
+    Returns:
+        ious (ndarray): The intersection over union in [0, 1] between the
+            `bbox` and each candidate. A higher score means a larger fraction
+            of the `bbox` is occluded by the candidate.
+    """
+    box_min, box_wh = bbox[:2], bbox[2:]
+    cand_min, cand_wh = candidates[:, :2], candidates[:, 2:]
+    box_max = box_min + box_wh
+    cand_max = cand_min + cand_wh
+
+    # Broadcasting pairs the single box with every candidate row, giving the
+    # corners of each overlap rectangle.
+    inter_min = np.maximum(box_min, cand_min)
+    inter_max = np.minimum(box_max, cand_max)
+    # Negative extents mean the boxes do not overlap along that axis.
+    inter_wh = np.maximum(0., inter_max - inter_min)
+
+    inter_area = inter_wh.prod(axis=1)
+    union_area = box_wh.prod() + cand_wh.prod(axis=1) - inter_area
+    return inter_area / union_area
+
+
+def iou_cost(tracks, detections, track_indices=None, detection_indices=None):
+    """
+    IoU distance metric.
+
+    Args:
+        tracks (list[Track]): A list of tracks.
+        detections (list[Detection]): A list of detections.
+        track_indices (Optional[list[int]]): A list of indices to tracks that
+            should be matched. Defaults to all `tracks`.
+        detection_indices (Optional[list[int]]): A list of indices to detections
+            that should be matched. Defaults to all `detections`.
+
+    Returns:
+        cost_matrix (ndarray): A cost matrix of shape len(track_indices),
+            len(detection_indices) where entry (i, j) is
+            `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`.
+            Rows of tracks with `time_since_update > 1` are filled with
+            `INFTY_COST` instead.
+    """
+    if track_indices is None:
+        track_indices = np.arange(len(tracks))
+    if detection_indices is None:
+        detection_indices = np.arange(len(detections))
+
+    cost_matrix = np.zeros((len(track_indices), len(detection_indices)))
+    # The candidate boxes are the same for every row, so build them once and
+    # only when some row actually needs them.
+    candidates = None
+    for row, track_idx in enumerate(track_indices):
+        track = tracks[track_idx]
+        if track.time_since_update > 1:
+            # Tracks not updated in the previous step are excluded from IoU
+            # matching by giving them the module-wide "infeasible" cost.
+            cost_matrix[row, :] = INFTY_COST
+            continue
+
+        if candidates is None:
+            candidates = np.asarray(
+                [detections[i].tlwh for i in detection_indices])
+        cost_matrix[row, :] = 1. - iou_1toN(track.to_tlwh(), candidates)
+    return cost_matrix
+
+
+def _nn_euclidean_distance(s, q):
+    """
+    Compute, for each query point, the smallest squared Euclidean distance to
+    any sample point.
+
+    Args:
+        s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M.
+        q (ndarray): Query points: an LxM matrix of L samples of dimensionality M.
+
+    Returns:
+        distances (ndarray): A vector of length L that contains for each entry
+            in `q` the smallest squared Euclidean distance to a sample in `s`.
+            When `s` or `q` is empty a zero vector of length L is returned.
+    """
+    s, q = np.asarray(s), np.asarray(q)
+    if len(s) == 0 or len(q) == 0:
+        # Keep the documented 1-D shape even when there is nothing to compare.
+        return np.zeros(len(q))
+    s2, q2 = np.square(s).sum(axis=1), np.square(q).sum(axis=1)
+    # ||s - q||^2 expanded as ||s||^2 - 2 s.q + ||q||^2 for all pairs at once.
+    distances = -2. * np.dot(s, q.T) + s2[:, None] + q2[None, :]
+
+    # Round-off can push a true zero slightly negative; clamp after reducing.
+    return np.maximum(0.0, distances.min(axis=0))
+
+
+def _nn_cosine_distance(s, q):
+    """
+    Compute, for each query point, the smallest cosine distance to any sample
+    point.
+
+    Args:
+        s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M.
+        q (ndarray): Query points: an LxM matrix of L samples of dimensionality M.
+
+    Returns:
+        distances (ndarray): A vector of length L that contains for each entry
+            in `q` the smallest cosine distance to a sample in `s`.
+    """
+    # Scale every row to unit length so the dot product is the cosine.
+    sample_norms = np.linalg.norm(s, axis=1, keepdims=True)
+    query_norms = np.linalg.norm(q, axis=1, keepdims=True)
+    unit_samples = np.asarray(s) / sample_norms
+    unit_queries = np.asarray(q) / query_norms
+
+    pairwise = 1. - np.dot(unit_samples, unit_queries.T)
+    return pairwise.min(axis=0)
+
+
+class NearestNeighborDistanceMetric(object):
+    """
+    A nearest neighbor distance metric that, for each target, returns
+    the closest distance to any sample that has been observed so far.
+
+    Args:
+        metric (str): Either "euclidean" or "cosine".
+        matching_threshold (float): The matching threshold. Samples with larger
+            distance are considered an invalid match.
+        budget (Optional[int]): If not None, fix samples per class to at most
+            this number. Removes the oldest samples when the budget is reached.
+
+    Attributes:
+        samples (Dict[int -> List[ndarray]]): A dictionary that maps from target
+            identities to the list of samples that have been observed so far.
+    """
+
+    def __init__(self, metric, matching_threshold, budget=None):
+        # Reject unknown names first, then pick the distance function.
+        if metric not in ("euclidean", "cosine"):
+            raise ValueError(
+                "Invalid metric; must be either 'euclidean' or 'cosine'")
+        if metric == "euclidean":
+            self._metric = _nn_euclidean_distance
+        else:
+            self._metric = _nn_cosine_distance
+        self.matching_threshold = matching_threshold
+        self.budget = budget
+        self.samples = {}
+
+    def partial_fit(self, features, targets, active_targets):
+        """
+        Update the distance metric with new data.
+
+        Args:
+            features (ndarray): An NxM matrix of N features of dimensionality M.
+            targets (ndarray): An integer array of associated target identities.
+            active_targets (List[int]): A list of targets that are currently
+                present in the scene. Samples of every other target are
+                dropped.
+        """
+        limit = self.budget
+        for feat, target_id in zip(features, targets):
+            gallery = self.samples.setdefault(target_id, [])
+            gallery.append(feat)
+            if limit is not None:
+                # Keep only the most recent `budget` samples.
+                self.samples[target_id] = gallery[-limit:]
+        kept = {}
+        for target_id in active_targets:
+            kept[target_id] = self.samples[target_id]
+        self.samples = kept
+
+    def distance(self, features, targets):
+        """
+        Compute distance between features and targets.
+
+        Args:
+            features (ndarray): An NxM matrix of N features of dimensionality M.
+            targets (list[int]): A list of targets to match the given `features` against.
+
+        Returns:
+            cost_matrix (ndarray): a cost matrix of shape len(targets), len(features),
+                where element (i, j) contains the closest distance, under the
+                configured metric, between `targets[i]` and `features[j]`.
+        """
+        costs = np.zeros((len(targets), len(features)))
+        for row, target_id in enumerate(targets):
+            costs[row] = self._metric(self.samples[target_id], features)
+        return costs
+
+
+def min_cost_matching(distance_metric,
+                      max_distance,
+                      tracks,
+                      detections,
+                      track_indices=None,
+                      detection_indices=None):
+    """
+    Solve linear assignment problem.
+
+    Args:
+        distance_metric :
+            Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray
+            The distance metric is given a list of tracks and detections as
+            well as a list of N track indices and M detection indices. The
+            metric should return the NxM dimensional cost matrix, where element
+            (i, j) is the association cost between the i-th track in the given
+            track indices and the j-th detection in the given detection_indices.
+            The returned matrix is modified in place by this function.
+        max_distance (float): Gating threshold. Associations with cost larger
+            than this value are disregarded.
+        tracks (list[Track]): A list of predicted tracks at the current time
+            step.
+        detections (list[Detection]): A list of detections at the current time
+            step.
+        track_indices (list[int]): List of track indices that maps rows in
+            `cost_matrix` to tracks in `tracks`. Defaults to all tracks.
+        detection_indices (List[int]): List of detection indices that maps
+            columns in `cost_matrix` to detections in `detections`. Defaults
+            to all detections.
+
+    Returns:
+        A tuple (List[(int, int)], List[int], List[int]) with the following
+        three entries:
+            * A list of matched track and detection indices.
+            * A list of unmatched track indices.
+            * A list of unmatched detection indices.
+    """
+    if track_indices is None:
+        track_indices = np.arange(len(tracks))
+    if detection_indices is None:
+        detection_indices = np.arange(len(detections))
+
+    if len(detection_indices) == 0 or len(track_indices) == 0:
+        # Nothing to match. Return fresh lists so the result type is the same
+        # as on the regular path and callers never alias their own inputs.
+        return [], list(track_indices), list(detection_indices)
+
+    cost_matrix = distance_metric(tracks, detections, track_indices,
+                                  detection_indices)
+
+    # Flatten every over-threshold cost to just above the threshold so the
+    # solver treats all infeasible pairs alike.
+    cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5
+    rows, cols = linear_sum_assignment(cost_matrix)
+
+    # Sets give constant-time membership tests instead of scanning ndarrays.
+    assigned_rows = set(rows.tolist())
+    assigned_cols = set(cols.tolist())
+
+    unmatched_detections = [
+        detection_idx for col, detection_idx in enumerate(detection_indices)
+        if col not in assigned_cols
+    ]
+    unmatched_tracks = [
+        track_idx for row, track_idx in enumerate(track_indices)
+        if row not in assigned_rows
+    ]
+    matches = []
+    for row, col in zip(rows, cols):
+        track_idx = track_indices[row]
+        detection_idx = detection_indices[col]
+        if cost_matrix[row, col] > max_distance:
+            # The solver had to pair them, but the pair is gated out.
+            unmatched_tracks.append(track_idx)
+            unmatched_detections.append(detection_idx)
+        else:
+            matches.append((track_idx, detection_idx))
+    return matches, unmatched_tracks, unmatched_detections
+
+
+def matching_cascade(distance_metric,
+                     max_distance,
+                     cascade_depth,
+                     tracks,
+                     detections,
+                     track_indices=None,
+                     detection_indices=None):
+    """
+    Run matching cascade.
+
+    Tracks are matched in order of increasing `time_since_update`, so the
+    most recently updated tracks get the first pick of the detections.
+
+    Args:
+        distance_metric :
+            Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray
+            The distance metric is given a list of tracks and detections as
+            well as a list of N track indices and M detection indices. The
+            metric should return the NxM dimensional cost matrix, where element
+            (i, j) is the association cost between the i-th track in the given
+            track indices and the j-th detection in the given detection_indices.
+        max_distance (float): Gating threshold. Associations with cost larger
+            than this value are disregarded.
+        cascade_depth (int): The cascade depth, should be set to the maximum
+            track age.
+        tracks (list[Track]): A list of predicted tracks at the current time
+            step.
+        detections (list[Detection]): A list of detections at the current time
+            step.
+        track_indices (list[int]): List of track indices that maps rows in
+            `cost_matrix` to tracks in `tracks`. Defaults to all tracks.
+        detection_indices (List[int]): List of detection indices that maps
+            columns in `cost_matrix` to detections in `detections`. Defaults
+            to all detections.
+
+    Returns:
+        A tuple (List[(int, int)], List[int], List[int]) with the following
+        three entries:
+            * A list of matched track and detection indices.
+            * A list of unmatched track indices.
+            * A list of unmatched detection indices.
+    """
+    if track_indices is None:
+        track_indices = list(range(len(tracks)))
+    if detection_indices is None:
+        detection_indices = list(range(len(detections)))
+
+    matches = []
+    remaining_detections = detection_indices
+    for age in range(1, cascade_depth + 1):
+        if len(remaining_detections) == 0:  # No detections left
+            break
+
+        same_age_tracks = [
+            k for k in track_indices if tracks[k].time_since_update == age
+        ]
+        if len(same_age_tracks) == 0:  # Nothing to match at this level
+            continue
+
+        # Only detections left over from earlier levels stay in play.
+        level_matches, _, remaining_detections = min_cost_matching(
+            distance_metric, max_distance, tracks, detections,
+            same_age_tracks, remaining_detections)
+        matches += level_matches
+
+    matched_tracks = set(k for k, _ in matches)
+    unmatched_tracks = list(set(track_indices) - matched_tracks)
+    return matches, unmatched_tracks, remaining_detections
+
+
+def gate_cost_matrix(kf,
+                     cost_matrix,
+                     tracks,
+                     detections,
+                     track_indices,
+                     detection_indices,
+                     gated_cost=INFTY_COST,
+                     only_position=False):
+    """
+    Invalidate infeasible entries in cost matrix based on the state
+    distributions obtained by Kalman filtering.
+
+    Args:
+        kf (object): The Kalman filter.
+        cost_matrix (ndarray): The NxM dimensional cost matrix, where N is the
+            number of track indices and M is the number of detection indices,
+            such that entry (i, j) is the association cost between
+            `tracks[track_indices[i]]` and `detections[detection_indices[j]]`.
+        tracks (list[Track]): A list of predicted tracks at the current time
+            step.
+        detections (list[Detection]): A list of detections at the current time
+            step.
+        track_indices (List[int]): List of track indices that maps rows in
+            `cost_matrix` to tracks in `tracks`.
+        detection_indices (List[int]): List of detection indices that maps
+            columns in `cost_matrix` to detections in `detections`.
+        gated_cost (Optional[float]): Entries in the cost matrix corresponding
+            to infeasible associations are set this value. Defaults to a very
+            large value.
+        only_position (Optional[bool]): If True, only the x, y position of the
+            state distribution is considered during gating. Default False.
+
+    Returns:
+        cost_matrix (ndarray): The input matrix, modified in place.
+    """
+    # Degrees of freedom of the gate: (x, y) only, or the full (x, y, a, h).
+    dof = 2 if only_position else 4
+    threshold = kalman_filter.chi2inv95[dof]
+    measurements = np.asarray(
+        [detections[j].to_xyah() for j in detection_indices])
+    for row, track_idx in enumerate(track_indices):
+        trk = tracks[track_idx]
+        distances = kf.gating_distance(trk.mean, trk.covariance, measurements,
+                                       only_position)
+        infeasible = distances > threshold
+        cost_matrix[row, infeasible] = gated_cost
+    return cost_matrix
--- a/ppdet/modeling/mot/matching/jde_matching.py
+++ b/ppdet/modeling/mot/matching/jde_matching.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is borrowed from https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/matching.py
+"""
+
+import scipy
+import numpy as np
+from scipy.spatial.distance import cdist
+from ..motion import kalman_filter
+
+# Names exported by `from .jde_matching import *`.
+__all__ = [
+    'merge_matches',
+    'linear_assignment',
+    'cython_bbox_ious',
+    'iou_distance',
+    'embedding_distance',
+    'fuse_motion',
+]
+
+
+def merge_matches(m1, m2, shape):
+    """
+    Chain two match lists that share their middle index set.
+
+    A pair (o, q) is produced whenever some p exists with (o, p) in `m1`
+    and (p, q) in `m2`.
+
+    Args:
+        m1 (array-like): Pairs (o, p) with o in range(O) and p in range(P).
+            May be empty.
+        m2 (array-like): Pairs (p, q) with p in range(P) and q in range(Q).
+            May be empty.
+        shape (tuple): The three sizes (O, P, Q).
+
+    Returns:
+        A tuple (match, unmatched_O, unmatched_Q): the list of chained
+        (o, q) pairs, and tuples of the indices of range(O) and range(Q)
+        that occur in no chained pair.
+    """
+    # `import scipy` alone does not guarantee the submodule is loaded.
+    import scipy.sparse
+
+    O, P, Q = shape
+    # reshape(-1, 2) keeps an empty match list two-dimensional so the column
+    # slices below stay valid.
+    m1 = np.asarray(m1, dtype=int).reshape(-1, 2)
+    m2 = np.asarray(m2, dtype=int).reshape(-1, 2)
+
+    M1 = scipy.sparse.coo_matrix(
+        (np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P))
+    M2 = scipy.sparse.coo_matrix(
+        (np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q))
+
+    # The matrix product is non-zero at (o, q) exactly when a shared p exists.
+    mask = M1.dot(M2)
+    match = mask.nonzero()
+    match = list(zip(match[0], match[1]))
+    unmatched_O = tuple(set(range(O)) - set(i for i, j in match))
+    unmatched_Q = tuple(set(range(Q)) - set(j for i, j in match))
+
+    return match, unmatched_O, unmatched_Q
+
+
+def linear_assignment(cost_matrix, thresh):
+    """
+    Solve the assignment problem for `cost_matrix` with `lap.lapjv`.
+
+    Args:
+        cost_matrix (ndarray): Two-dimensional cost matrix.
+        thresh (float): Passed to `lap.lapjv` as `cost_limit`.
+
+    Returns:
+        A tuple (matches, unmatched_a, unmatched_b): an array of
+        [row, column] pairs, followed by the row indices and the column
+        indices that were left unassigned.
+    """
+    if cost_matrix.size == 0:
+        # Nothing to solve: every row and every column is unmatched.
+        num_rows, num_cols = cost_matrix.shape[0], cost_matrix.shape[1]
+        no_matches = np.empty((0, 2), dtype=int)
+        return no_matches, tuple(range(num_rows)), tuple(range(num_cols))
+
+    import lap
+    _, row_assign, col_assign = lap.lapjv(
+        cost_matrix, extend_cost=True, cost_limit=thresh)
+    # Negative entries mark rows/columns without a partner.
+    pairs = [[row, col] for row, col in enumerate(row_assign) if col >= 0]
+    unmatched_a = np.where(row_assign < 0)[0]
+    unmatched_b = np.where(col_assign < 0)[0]
+    return np.asarray(pairs), unmatched_a, unmatched_b
+
+
+def cython_bbox_ious(atlbrs, btlbrs):
+    """
+    Compute the pairwise IoU matrix of two box lists with `cython_bbox`.
+
+    Args:
+        atlbrs (list | ndarray): First set of boxes, one box per entry.
+        btlbrs (list | ndarray): Second set of boxes, one box per entry.
+
+    Returns:
+        ious (ndarray): Matrix of shape (len(atlbrs), len(btlbrs)). When
+            either input is empty an all-zero matrix is returned and
+            `cython_bbox` is never imported.
+    """
+    # `np.float` was only an alias of the builtin float and has been removed
+    # from NumPy; `np.float64` is the same dtype.
+    ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float64)
+    if ious.size == 0:
+        return ious
+    import cython_bbox
+    ious = cython_bbox.bbox_overlaps(
+        np.ascontiguousarray(
+            atlbrs, dtype=np.float64),
+        np.ascontiguousarray(
+            btlbrs, dtype=np.float64))
+    return ious
+
+
+def iou_distance(atracks, btracks):
+    """
+    Compute cost based on IoU between two list[STrack].
+
+    Either argument may instead already hold boxes as ndarrays; if the first
+    element of either list is an ndarray, both lists are used as boxes
+    directly. Otherwise the `tlbr` attribute of every track is used.
+
+    Returns:
+        cost_matrix (ndarray): `1 - IoU` for every (atrack, btrack) pair.
+    """
+    a_is_boxes = len(atracks) > 0 and isinstance(atracks[0], np.ndarray)
+    b_is_boxes = len(btracks) > 0 and isinstance(btracks[0], np.ndarray)
+    if a_is_boxes or b_is_boxes:
+        boxes_a, boxes_b = atracks, btracks
+    else:
+        boxes_a = [trk.tlbr for trk in atracks]
+        boxes_b = [trk.tlbr for trk in btracks]
+
+    return 1 - cython_bbox_ious(boxes_a, boxes_b)
+
+
+def embedding_distance(tracks, detections, metric='euclidean'):
+    """
+    Compute cost based on features between two list[STrack].
+
+    Args:
+        tracks (list[STrack]): Tracks; their `smooth_feat` is compared.
+        detections (list[STrack]): Detections; their `curr_feat` is compared.
+        metric (str): Distance name forwarded to `scipy.spatial.distance.cdist`.
+
+    Returns:
+        cost_matrix (ndarray): Matrix of shape (len(tracks), len(detections))
+            with the non-negative pairwise feature distances.
+    """
+    # `np.float` was only an alias of the builtin float and has been removed
+    # from NumPy; `np.float64` is the same dtype.
+    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float64)
+    if cost_matrix.size == 0:
+        return cost_matrix
+    det_features = np.asarray(
+        [track.curr_feat for track in detections], dtype=np.float64)
+    track_features = np.asarray(
+        [track.smooth_feat for track in tracks], dtype=np.float64)
+    # Clamp at zero so round-off never yields a negative distance.
+    cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric))
+    return cost_matrix
+
+
+def fuse_motion(kf,
+                cost_matrix,
+                tracks,
+                detections,
+                only_position=False,
+                lambda_=0.98):
+    """
+    Blend an appearance cost matrix with Kalman gating distances.
+
+    Entries whose gating distance exceeds the chi-square threshold are set
+    to infinity; every row is then replaced by
+    `lambda_ * cost + (1 - lambda_) * gating_distance`.
+
+    Args:
+        kf (object): The Kalman filter providing `gating_distance`.
+        cost_matrix (ndarray): Cost matrix with one row per track and one
+            column per detection. It is modified in place.
+        tracks (list): Tracks exposing `mean` and `covariance`.
+        detections (list): Detections exposing `to_xyah()`.
+        only_position (bool): If True, gate on 2 degrees of freedom
+            instead of 4.
+        lambda_ (float): Weight of the original cost in the blend.
+
+    Returns:
+        cost_matrix (ndarray): The same matrix object that was passed in.
+    """
+    if cost_matrix.size == 0:
+        return cost_matrix
+
+    dof = 2 if only_position else 4
+    threshold = kalman_filter.chi2inv95[dof]
+    measurements = np.asarray([det.to_xyah() for det in detections])
+    motion_weight = 1 - lambda_
+    for row, track in enumerate(tracks):
+        distances = kf.gating_distance(
+            track.mean,
+            track.covariance,
+            measurements,
+            only_position,
+            metric='maha')
+        # Gate first, so that gated entries stay infinite after blending.
+        cost_matrix[row, distances > threshold] = np.inf
+        cost_matrix[row] = lambda_ * cost_matrix[row] + motion_weight * distances
+    return cost_matrix
--- a/ppdet/modeling/mot/motion/__init__.py
+++ b/ppdet/modeling/mot/motion/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import kalman_filter
+
+from .kalman_filter import *
--- a/ppdet/modeling/mot/motion/kalman_filter.py
+++ b/ppdet/modeling/mot/motion/kalman_filter.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is borrowed from https://github.com/nwojke/deep_sort/blob/master/deep_sort/kalman_filter.py
+"""
+
+import numpy as np
+import scipy.linalg
+from ppdet.core.workspace import register, serializable
+
+# Only the filter class is exported by `from .kalman_filter import *`; the
+# `chi2inv95` table below is reached through the module itself.
+__all__ = ['KalmanFilter']
+"""
+Table for the 0.95 quantile of the chi-square distribution with N degrees of
+freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv
+function and used as Mahalanobis gating threshold.
+"""
+
+# Keys are the degrees of freedom N, values the quantiles described above.
+chi2inv95 = {
+    1: 3.8415,
+    2: 5.9915,
+    3: 7.8147,
+    4: 9.4877,
+    5: 11.070,
+    6: 12.592,
+    7: 14.067,
+    8: 15.507,
+    9: 16.919
+}
+
+
+@register
+@serializable
+class KalmanFilter(object):
+    """
+    A simple Kalman filter for tracking bounding boxes in image space.
+
+    The 8-dimensional state space
+
+        x, y, a, h, vx, vy, va, vh
+
+    contains the bounding box center position (x, y), aspect ratio a, height h,
+    and their respective velocities.
+
+    Object motion follows a constant velocity model. The bounding box location
+    (x, y, a, h) is taken as direct observation of the state space (linear
+    observation model).
+
+    """
+
+    def __init__(self):
+        ndim, dt = 4, 1.
+        state_dim = 2 * ndim
+
+        # Transition matrix of the constant velocity model: identity plus
+        # `dt` coupling every position component to its velocity.
+        self._motion_mat = np.eye(state_dim, state_dim)
+        self._motion_mat[:ndim, ndim:] = dt * np.eye(ndim)
+        # Observation matrix: picks the first `ndim` state components.
+        self._update_mat = np.eye(ndim, state_dim)
+
+        # Noise levels are expressed relative to the current state estimate
+        # (they are multiplied by the box height elsewhere in this class).
+        # These weights control the amount of uncertainty in the model.
+        self._std_weight_position = 1. / 20
+        self._std_weight_velocity = 1. / 160
+
+    def initiate(self, measurement):
+        """
+        Create track from unassociated measurement.
+
+        Args:
+            measurement (ndarray): Bounding box coordinates (x, y, a, h) with
+                center position (x, y), aspect ratio a, and height h.
+
+        Returns:
+            The mean vector (8 dimensional) and covariance matrix (8x8
+            dimensional) of the new track. Unobserved velocities are
+            initialized to 0 mean.
+        """
+        height = measurement[3]
+        # Initial uncertainty scales with the box height, except for the
+        # aspect ratio and its velocity, which use fixed values.
+        pos_std = 2 * self._std_weight_position * height
+        vel_std = 10 * self._std_weight_velocity * height
+        std = [
+            pos_std, pos_std, 1e-2, pos_std, vel_std, vel_std, 1e-5, vel_std
+        ]
+
+        mean = np.r_[measurement, np.zeros_like(measurement)]
+        return mean, np.diag(np.square(std))
+
+    def predict(self, mean, covariance):
+        """
+        Run Kalman filter prediction step.
+
+        Args:
+            mean (ndarray): The 8 dimensional mean vector of the object state
+                at the previous time step.
+            covariance (ndarray): The 8x8 dimensional covariance matrix of the
+                object state at the previous time step.
+
+        Returns:
+            The mean vector and covariance matrix of the predicted state.
+        """
+        height = mean[3]
+        # Process noise scales with the box height, except for the aspect
+        # ratio and its velocity, which use fixed values.
+        pos_std = self._std_weight_position * height
+        vel_std = self._std_weight_velocity * height
+        noise_std = np.r_[[pos_std, pos_std, 1e-2, pos_std],
+                          [vel_std, vel_std, 1e-5, vel_std]]
+        motion_cov = np.diag(np.square(noise_std))
+
+        transition = self._motion_mat
+        predicted_mean = np.dot(mean, transition.T)
+        predicted_cov = np.linalg.multi_dot(
+            (transition, covariance, transition.T)) + motion_cov
+
+        return predicted_mean, predicted_cov
+
+    def project(self, mean, covariance):
+        """
+        Project state distribution to measurement space.
+
+        Args:
+            mean (ndarray): The state's mean vector (8 dimensional array).
+            covariance (ndarray): The state's covariance matrix (8x8 dimensional).
+
+        Returns:
+            The projected mean and covariance matrix of the given state estimate.
+        """
+        # Measurement noise scales with the box height, except for the
+        # aspect ratio, which uses a fixed value.
+        pos_std = self._std_weight_position * mean[3]
+        innovation_cov = np.diag(
+            np.square([pos_std, pos_std, 1e-1, pos_std]))
+
+        observation = self._update_mat
+        projected_mean = np.dot(observation, mean)
+        projected_cov = np.linalg.multi_dot((observation, covariance,
+                                             observation.T))
+        return projected_mean, projected_cov + innovation_cov
+
+    def multi_predict(self, mean, covariance):
+        """
+        Run Kalman filter prediction step (Vectorized version).
+
+        Args:
+            mean (ndarray): The Nx8 dimensional mean matrix of the object states
+                at the previous time step. N may be 0.
+            covariance (ndarray): The Nx8x8 dimensional covariance matrices of
+                the object states at the previous time step.
+
+        Returns:
+            The Nx8 mean matrix and Nx8x8 covariance matrices of the
+            predicted states.
+        """
+        heights = mean[:, 3]
+        # Process noise scales with each box height, except for the aspect
+        # ratio and its velocity, which use fixed values.
+        pos_std = self._std_weight_position * heights
+        vel_std = self._std_weight_velocity * heights
+        ratio_std = 1e-2 * np.ones_like(heights)
+        ratio_vel_std = 1e-5 * np.ones_like(heights)
+        variances = np.square(
+            np.stack(
+                [
+                    pos_std, pos_std, ratio_std, pos_std, vel_std, vel_std,
+                    ratio_vel_std, vel_std
+                ],
+                axis=1))
+
+        # One diagonal noise matrix per object, filled without a Python loop.
+        # Allocating with an explicit (N, 8, 8) shape also keeps an empty
+        # batch well-formed.
+        num_objects, state_dim = variances.shape
+        motion_cov = np.zeros(
+            (num_objects, state_dim, state_dim), dtype=variances.dtype)
+        diag = np.arange(state_dim)
+        motion_cov[:, diag, diag] = variances
+
+        mean = np.dot(mean, self._motion_mat.T)
+        # np.dot places the batch axis in the middle; move it back in front.
+        left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2))
+        covariance = np.dot(left, self._motion_mat.T) + motion_cov
+
+        return mean, covariance
+
+    def update(self, mean, covariance, measurement):
+        """
+        Run Kalman filter correction step.
+
+        Args:
+            mean (ndarray): The predicted state's mean vector (8 dimensional).
+            covariance (ndarray): The state's covariance matrix (8x8 dimensional).
+            measurement (ndarray): The 4 dimensional measurement vector
+                (x, y, a, h), where (x, y) is the center position, a the aspect
+                ratio, and h the height of the bounding box.
+
+        Returns:
+            The measurement-corrected state distribution.
+        """
+        projected_mean, projected_cov = self.project(mean, covariance)
+
+        # Solve for the gain through a Cholesky factorisation of the
+        # projected covariance rather than forming its inverse.
+        factorization = scipy.linalg.cho_factor(
+            projected_cov, lower=True, check_finite=False)
+        cross_cov = np.dot(covariance, self._update_mat.T)
+        kalman_gain = scipy.linalg.cho_solve(
+            factorization, cross_cov.T, check_finite=False).T
+
+        residual = measurement - projected_mean
+        corrected_mean = mean + np.dot(residual, kalman_gain.T)
+        corrected_cov = covariance - np.linalg.multi_dot(
+            (kalman_gain, projected_cov, kalman_gain.T))
+        return corrected_mean, corrected_cov
+
+    def gating_distance(self,
+                        mean,
+                        covariance,
+                        measurements,
+                        only_position=False,
+                        metric='maha'):
+        """
+        Compute gating distance between state distribution and measurements.
+        A suitable distance threshold can be obtained from `chi2inv95`. If
+        `only_position` is False, the chi-square distribution has 4 degrees of
+        freedom, otherwise 2.
+
+        Args:
+            mean (ndarray): Mean vector over the state distribution (8
+                dimensional).
+            covariance (ndarray): Covariance of the state distribution (8x8
+                dimensional).
+            measurements (ndarray): An Nx4 dimensional matrix of N measurements,
+                each in format (x, y, a, h) where (x, y) is the bounding box center
+                position, a the aspect ratio, and h the height.
+            only_position (Optional[bool]): If True, distance computation is
+                done with respect to the bounding box center position only.
+            metric (str): Metric type, 'gaussian' or 'maha'.
+
+        Returns:
+            An array of length N. For 'maha' the i-th element is the squared
+            Mahalanobis distance between (mean, covariance) and
+            `measurements[i]`; for 'gaussian' it is the sum of squared
+            differences between the projected mean and `measurements[i]`.
+
+        Raises:
+            ValueError: If `metric` is neither 'gaussian' nor 'maha'.
+        """
+        proj_mean, proj_cov = self.project(mean, covariance)
+        if only_position:
+            # Keep only the first two (center) components.
+            proj_mean = proj_mean[:2]
+            proj_cov = proj_cov[:2, :2]
+            measurements = measurements[:, :2]
+
+        diff = measurements - proj_mean
+        if metric == 'gaussian':
+            return np.sum(diff * diff, axis=1)
+        if metric != 'maha':
+            raise ValueError('invalid distance metric')
+
+        # With covariance = L L^T, solving L z = diff^T yields columns z whose
+        # squared norms are the squared Mahalanobis distances.
+        lower_factor = np.linalg.cholesky(proj_cov)
+        whitened = scipy.linalg.solve_triangular(
+            lower_factor,
+            diff.T,
+            lower=True,
+            check_finite=False,
+            overwrite_b=True)
+        return np.sum(whitened * whitened, axis=0)
--- a/ppdet/modeling/mot/tracker/__init__.py
+++ b/ppdet/modeling/mot/tracker/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import base_jde_tracker
+from . import base_sde_tracker
+from . import jde_tracker
+from . import deepsort_tracker
+
+from .base_jde_tracker import *
+from .base_sde_tracker import *
+from .jde_tracker import *
+from .deepsort_tracker import *
--- a/ppdet/modeling/mot/tracker/base_jde_tracker.py
+++ b/ppdet/modeling/mot/tracker/base_jde_tracker.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is borrowed from https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py
+"""
+
+import numpy as np
+from collections import deque, OrderedDict
+from ..matching import jde_matching as matching
+from ppdet.core.workspace import register, serializable
+
+# Names exported by `from .base_jde_tracker import *`.
+# NOTE(review): base_sde_tracker.__all__ also lists 'TrackState', and the
+# tracker package __init__ star-imports that module after this one, so the
+# package-level `TrackState` name ends up bound to the SDE class, not this one.
+__all__ = [
+    'TrackState',
+    'BaseTrack',
+    'STrack',
+    'joint_stracks',
+    'sub_stracks',
+    'remove_duplicate_stracks',
+]
+
+
+class TrackState(object):
+    """Integer state codes assigned to a JDE track's `state` attribute."""
+    # Consecutive codes starting at zero:
+    # New = 0, Tracked = 1, Lost = 2, Removed = 3.
+    New, Tracked, Lost, Removed = range(4)
+
+
+@register
+@serializable
+class BaseTrack(object):
+    """
+    Base class holding the bookkeeping attributes of a single track.
+
+    `activate`, `predict` and `update` are placeholders that raise
+    NotImplementedError and are meant to be overridden by subclasses.
+    """
+    # Counter behind `next_id`. `next_id` always reads and writes it through
+    # `BaseTrack`, so ids are drawn from one sequence for all subclasses.
+    _count = 0
+
+    # Identity and lifecycle defaults.
+    track_id = 0
+    state = TrackState.New
+    is_activated = False
+    start_frame = 0
+    frame_id = 0
+    time_since_update = 0
+
+    # Appearance / score defaults.
+    # NOTE: `history` and `features` are mutable objects defined on the class;
+    # they are shared by every instance that does not rebind them.
+    score = 0
+    curr_feature = None
+    features = []
+    history = OrderedDict()
+
+    # multi-camera
+    location = (np.inf, np.inf)
+
+    @property
+    def end_frame(self):
+        """Frame id stored in `frame_id`."""
+        return self.frame_id
+
+    @staticmethod
+    def next_id():
+        """Increment the shared counter and return its new value."""
+        new_id = BaseTrack._count + 1
+        BaseTrack._count = new_id
+        return new_id
+
+    def activate(self, *args):
+        """Placeholder; subclasses provide the implementation."""
+        raise NotImplementedError
+
+    def predict(self):
+        """Placeholder; subclasses provide the implementation."""
+        raise NotImplementedError
+
+    def update(self, *args, **kwargs):
+        """Placeholder; subclasses provide the implementation."""
+        raise NotImplementedError
+
+    def mark_lost(self):
+        """Set `state` to `TrackState.Lost`."""
+        self.state = TrackState.Lost
+
+    def mark_removed(self):
+        """Set `state` to `TrackState.Removed`."""
+        self.state = TrackState.Removed
+
+
+@register
+@serializable
+class STrack(BaseTrack):
+    """
+    Single-object track used by the JDE tracker.
+
+    Args:
+        tlwh (array-like): Bounding box as `(top left x, top left y, width,
+            height)`.
+        score (float): Score of the detection this track is built from.
+        temp_feat (ndarray): Appearance embedding of the detection. It is
+            L2-normalized in place.
+        buffer_size (int): Maximum number of embeddings kept in `features`.
+    """
+
+    def __init__(self, tlwh, score, temp_feat, buffer_size=30):
+        # wait activate
+        # The builtin `float` is used instead of the `np.float` alias, which
+        # newer NumPy releases no longer provide.
+        self._tlwh = np.asarray(tlwh, dtype=float)
+        self.kalman_filter = None
+        self.mean, self.covariance = None, None
+        self.is_activated = False
+
+        self.score = score
+        self.tracklet_len = 0
+
+        # The per-instance buffer and smoothing factor must exist before
+        # `update_features` runs; otherwise the first embedding would be
+        # appended to the list defined on the BaseTrack class (shared by all
+        # tracks) and be missing from this track's own buffer.
+        self.features = deque([], maxlen=buffer_size)
+        self.alpha = 0.9
+        self.smooth_feat = None
+        self.update_features(temp_feat)
+
+    def update_features(self, feat):
+        """
+        Record a new embedding and refresh the smoothed embedding.
+
+        `feat` is normalized in place, stored as `curr_feat`, appended to
+        `features`, and blended into `smooth_feat` as an exponential moving
+        average with weight `alpha` on the previous value.
+        """
+        feat /= np.linalg.norm(feat)
+        self.curr_feat = feat
+        if self.smooth_feat is None:
+            self.smooth_feat = feat
+        else:
+            self.smooth_feat = self.alpha * self.smooth_feat + (1 - self.alpha
+                                                                ) * feat
+        self.features.append(feat)
+        self.smooth_feat /= np.linalg.norm(self.smooth_feat)
+
+    def predict(self):
+        """Advance `mean` and `covariance` one step with the Kalman filter."""
+        mean_state = self.mean.copy()
+        if self.state != TrackState.Tracked:
+            # Last state component is zeroed for tracks that are not being
+            # tracked (presumably the height velocity - confirm against the
+            # Kalman filter's state layout).
+            mean_state[7] = 0
+        self.mean, self.covariance = self.kalman_filter.predict(mean_state,
+                                                                self.covariance)
+
+    @staticmethod
+    def multi_predict(stracks, kalman_filter):
+        """
+        Batched version of `predict` for a list of tracks.
+
+        Args:
+            stracks (list[STrack]): Tracks to advance; updated in place.
+            kalman_filter (object): Filter providing `multi_predict`.
+        """
+        if len(stracks) > 0:
+            multi_mean = np.asarray([st.mean.copy() for st in stracks])
+            multi_covariance = np.asarray([st.covariance for st in stracks])
+            for i, st in enumerate(stracks):
+                if st.state != TrackState.Tracked:
+                    multi_mean[i][7] = 0
+            multi_mean, multi_covariance = kalman_filter.multi_predict(
+                multi_mean, multi_covariance)
+            for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)):
+                stracks[i].mean = mean
+                stracks[i].covariance = cov
+
+    def activate(self, kalman_filter, frame_id):
+        """Start a new tracklet"""
+        self.kalman_filter = kalman_filter
+        self.track_id = self.next_id()
+        self.mean, self.covariance = self.kalman_filter.initiate(
+            self.tlwh_to_xyah(self._tlwh))
+
+        self.tracklet_len = 0
+        self.state = TrackState.Tracked
+        # Only tracks started on frame 1 are flagged as activated right away.
+        if frame_id == 1:
+            self.is_activated = True
+        self.frame_id = frame_id
+        self.start_frame = frame_id
+
+    def re_activate(self, new_track, frame_id, new_id=False):
+        """
+        Resume this track from a matched detection.
+
+        Args:
+            new_track (STrack): Detection wrapped as an STrack.
+            frame_id (int): Current frame id.
+            new_id (bool): If True, assign a fresh `track_id`.
+        """
+        self.mean, self.covariance = self.kalman_filter.update(
+            self.mean, self.covariance, self.tlwh_to_xyah(new_track.tlwh))
+
+        self.update_features(new_track.curr_feat)
+        self.tracklet_len = 0
+        self.state = TrackState.Tracked
+        self.is_activated = True
+        self.frame_id = frame_id
+        if new_id:
+            self.track_id = self.next_id()
+
+    def update(self, new_track, frame_id, update_feature=True):
+        """
+        Update this track with a matched detection.
+
+        Args:
+            new_track (STrack): Detection wrapped as an STrack.
+            frame_id (int): Current frame id.
+            update_feature (bool): If True, also fold the detection's
+                embedding into the track's features.
+        """
+        self.frame_id = frame_id
+        self.tracklet_len += 1
+
+        new_tlwh = new_track.tlwh
+        self.mean, self.covariance = self.kalman_filter.update(
+            self.mean, self.covariance, self.tlwh_to_xyah(new_tlwh))
+        self.state = TrackState.Tracked
+        self.is_activated = True
+
+        self.score = new_track.score
+        if update_feature:
+            self.update_features(new_track.curr_feat)
+
+    @property
+    def tlwh(self):
+        """
+        Get current position in bounding box format `(top left x, top left y,
+        width, height)`.
+        """
+        # Before activation there is no filter state; fall back to the box
+        # given at construction.
+        if self.mean is None:
+            return self._tlwh.copy()
+        ret = self.mean[:4].copy()
+        ret[2] *= ret[3]
+        ret[:2] -= ret[2:] / 2
+        return ret
+
+    @property
+    def tlbr(self):
+        """
+        Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,
+        `(top left, bottom right)`.
+        """
+        ret = self.tlwh.copy()
+        ret[2:] += ret[:2]
+        return ret
+
+    @staticmethod
+    def tlwh_to_xyah(tlwh):
+        """
+        Convert bounding box to format `(center x, center y, aspect ratio,
+        height)`, where the aspect ratio is `width / height`.
+        """
+        ret = np.asarray(tlwh).copy()
+        ret[:2] += ret[2:] / 2
+        ret[2] /= ret[3]
+        return ret
+
+    def to_xyah(self):
+        """Return the current box as `(center x, center y, aspect ratio, height)`."""
+        return self.tlwh_to_xyah(self.tlwh)
+
+    @staticmethod
+    def tlbr_to_tlwh(tlbr):
+        """Convert `(min x, min y, max x, max y)` to `(top left x, top left y, width, height)`."""
+        ret = np.asarray(tlbr).copy()
+        ret[2:] -= ret[:2]
+        return ret
+
+    @staticmethod
+    def tlwh_to_tlbr(tlwh):
+        """Convert `(top left x, top left y, width, height)` to `(min x, min y, max x, max y)`."""
+        ret = np.asarray(tlwh).copy()
+        ret[2:] += ret[:2]
+        return ret
+
+    def __repr__(self):
+        return 'OT_{}_({}-{})'.format(self.track_id, self.start_frame,
+                                      self.end_frame)
+
+
+def joint_stracks(tlista, tlistb):
+    """
+    Concatenate two track lists, skipping tracks of `tlistb` whose id is
+    already present.
+
+    Every element of `tlista` is kept as is; an element of `tlistb` is added
+    only the first time its `track_id` is seen.
+
+    Returns:
+        list: The merged tracks, `tlista` entries first.
+    """
+    merged = []
+    seen_ids = set()
+    for track in tlista:
+        seen_ids.add(track.track_id)
+        merged.append(track)
+    for track in tlistb:
+        if track.track_id in seen_ids:
+            continue
+        seen_ids.add(track.track_id)
+        merged.append(track)
+    return merged
+
+
+def sub_stracks(tlista, tlistb):
+    """
+    Return the tracks of `tlista` whose `track_id` does not occur in `tlistb`.
+
+    Tracks are keyed by `track_id`, so when `tlista` holds several tracks with
+    the same id only the last one is kept.
+    """
+    remaining = {track.track_id: track for track in tlista}
+    for tid in (track.track_id for track in tlistb):
+        if remaining.get(tid, 0):
+            del remaining[tid]
+    return list(remaining.values())
+
+
+def remove_duplicate_stracks(stracksa, stracksb):
+    """
+    Remove tracks that are duplicated across the two lists.
+
+    A pair `(stracksa[p], stracksb[q])` counts as a duplicate when the value
+    returned by `matching.iou_distance` for it is below 0.15. Of each such
+    pair, the track with the shorter `frame_id - start_frame` span is dropped;
+    on a tie the one from `stracksa` is dropped.
+
+    Returns:
+        tuple: `(resa, resb)`, the two lists without the dropped tracks.
+    """
+    pdist = matching.iou_distance(stracksa, stracksb)
+    drop_a, drop_b = set(), set()
+    for ia, ib in zip(*np.where(pdist < 0.15)):
+        span_a = stracksa[ia].frame_id - stracksa[ia].start_frame
+        span_b = stracksb[ib].frame_id - stracksb[ib].start_frame
+        if span_a > span_b:
+            drop_b.add(ib)
+        else:
+            drop_a.add(ia)
+    kept_a = [t for idx, t in enumerate(stracksa) if idx not in drop_a]
+    kept_b = [t for idx, t in enumerate(stracksb) if idx not in drop_b]
+    return kept_a, kept_b
--- a/ppdet/modeling/mot/tracker/base_sde_tracker.py
+++ b/ppdet/modeling/mot/tracker/base_sde_tracker.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is borrowed from https://github.com/nwojke/deep_sort/blob/master/deep_sort/track.py
+"""
+
+import numpy as np
+from ppdet.core.workspace import register, serializable
+
+# NOTE(review): 'TrackState' is also exported by base_jde_tracker.__all__; the
+# tracker package __init__ star-imports that module first and this one second,
+# so the package-level `TrackState` name resolves to the class defined here.
+__all__ = ['TrackState', 'Track']
+
+
+class TrackState(object):
+    """
+    Integer codes for the life cycle of a single target track.
+
+    A new track starts as `Tentative` and becomes `Confirmed` once enough
+    evidence has been collected. A track that is no longer alive is marked
+    `Deleted` so it can be removed from the set of active tracks.
+    """
+    Tentative, Confirmed, Deleted = 1, 2, 3
+
+
+@register
+@serializable
+class Track(object):
+    """
+    A single target track with state space `(x, y, a, h)` and associated
+    velocities, where `(x, y)` is the center of the bounding box, `a` is the
+    aspect ratio and `h` is the height.
+
+    Args:
+        mean (ndarray): Mean vector of the initial state distribution.
+        covariance (ndarray): Covariance matrix of the initial state distribution.
+        track_id (int): A unique track identifier.
+        n_init (int): Number of consecutive detections before the track is confirmed.
+            The track state is set to `Deleted` if a miss occurs within the first
+            `n_init` frames.
+        max_age (int): The maximum number of consecutive misses before the track
+            state is set to `Deleted`.
+        feature (Optional[ndarray]): Feature vector of the detection this track
+            originates from. If not None, this feature is added to the `features` cache.
+
+    Attributes:
+        hits (int): Total number of measurement updates.
+        age (int): Total number of frames since first occurrence.
+        time_since_update (int): Total number of frames since last measurement
+            update.
+        state (TrackState): The current track state.
+        features (List[ndarray]): A cache of features. On each measurement update,
+            the associated feature vector is added to this list.
+    """
+
+    def __init__(self,
+                 mean,
+                 covariance,
+                 track_id,
+                 n_init,
+                 max_age,
+                 feature=None):
+        # Filter state and identity.
+        self.mean = mean
+        self.covariance = covariance
+        self.track_id = track_id
+
+        # Counters: the creating detection counts as the first hit and frame.
+        self.hits = 1
+        self.age = 1
+        self.time_since_update = 0
+
+        self.state = TrackState.Tentative
+        self.features = [] if feature is None else [feature]
+
+        self._n_init = n_init
+        self._max_age = max_age
+
+    def to_tlwh(self):
+        """Return the box as `(top left x, top left y, width, height)`."""
+        box = self.mean[:4].copy()
+        # Aspect ratio times height gives the width.
+        box[2] *= box[3]
+        # Shift from the center to the top-left corner.
+        box[:2] -= box[2:] / 2
+        return box
+
+    def to_tlbr(self):
+        """Return the box as `(min x, min y, max x, max y)`."""
+        box = self.to_tlwh()
+        box[2:] += box[:2]
+        return box
+
+    def predict(self, kalman_filter):
+        """
+        Advance the state distribution by one time step with the given Kalman
+        filter and increment `age` and `time_since_update`.
+        """
+        predicted = kalman_filter.predict(self.mean, self.covariance)
+        self.mean, self.covariance = predicted
+        self.age += 1
+        self.time_since_update += 1
+
+    def update(self, kalman_filter, detection):
+        """
+        Apply the Kalman filter measurement update for `detection` and cache
+        the detection's feature vector.
+        """
+        corrected = kalman_filter.update(self.mean, self.covariance,
+                                         detection.to_xyah())
+        self.mean, self.covariance = corrected
+        self.features.append(detection.feature)
+
+        self.hits += 1
+        self.time_since_update = 0
+        # Promote a tentative track once it has collected enough hits.
+        if self.state == TrackState.Tentative and self.hits >= self._n_init:
+            self.state = TrackState.Confirmed
+
+    def mark_missed(self):
+        """
+        Register a missed association: a tentative track, or one whose
+        `time_since_update` exceeds `max_age`, becomes `Deleted`.
+        """
+        if (self.state == TrackState.Tentative or
+                self.time_since_update > self._max_age):
+            self.state = TrackState.Deleted
+
+    def is_tentative(self):
+        """Return True if the track state is `Tentative`."""
+        return self.state == TrackState.Tentative
+
+    def is_confirmed(self):
+        """Return True if the track state is `Confirmed`."""
+        return self.state == TrackState.Confirmed
+
+    def is_deleted(self):
+        """Return True if the track state is `Deleted`."""
+        return self.state == TrackState.Deleted
--- a/ppdet/modeling/mot/tracker/deepsort_tracker.py
+++ b/ppdet/modeling/mot/tracker/deepsort_tracker.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is borrowed from https://github.com/nwojke/deep_sort/blob/master/deep_sort/tracker.py
+"""
+
+import numpy as np
+
+from ..matching.deepsort_matching import NearestNeighborDistanceMetric
+from ..matching.deepsort_matching import iou_cost, min_cost_matching, matching_cascade, gate_cost_matrix
+from .base_sde_tracker import Track
+
+from ppdet.core.workspace import register, serializable
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['DeepSORTTracker']
+
+
+@register
+@serializable
+class DeepSORTTracker(object):
+    """
+    DeepSORT tracker
+
+    Args:
+        img_size (list): input image size, [h, w]
+        budget (int): If not None, fix samples per class to at most this number.
+            Removes the oldest samples when the budget is reached.
+        max_age (int): maximum number of consecutive misses before a track is
+            deleted
+        n_init (int): Number of frames that a track remains in initialization
+            phase. Number of consecutive detections before the track is confirmed.
+            The track state is set to `Deleted` if a miss occurs within the first
+            `n_init` frames.
+        metric_type (str): either "euclidean" or "cosine", the distance metric
+            used for measurement to track association.
+        matching_threshold (float): samples with larger distance are
+            considered an invalid match.
+        max_iou_distance (float): max iou distance threshold
+        motion (object): KalmanFilter instance
+    """
+    # The docstring has to be the first statement of the class body to be
+    # stored as `__doc__`, so `__inject__` is declared after it.
+    __inject__ = ['motion']
+
+    def __init__(self,
+                 img_size=[608, 1088],
+                 budget=100,
+                 max_age=30,
+                 n_init=3,
+                 metric_type='cosine',
+                 matching_threshold=0.2,
+                 max_iou_distance=0.7,
+                 motion='KalmanFilter'):
+        # NOTE: the default `img_size` list is shared between instances; it is
+        # only stored here, never modified.
+        self.img_size = img_size
+        self.max_age = max_age
+        self.n_init = n_init
+        self.metric = NearestNeighborDistanceMetric(metric_type,
+                                                    matching_threshold, budget)
+        self.max_iou_distance = max_iou_distance
+        self.motion = motion
+
+        self.tracks = []
+        # Id handed to the next track created by `_initiate_track`.
+        self._next_id = 1
+
+    def predict(self):
+        """
+        Propagate track state distributions one time step forward.
+        This function should be called once every time step, before `update`.
+        """
+        for track in self.tracks:
+            track.predict(self.motion)
+
+    def update(self, detections):
+        """
+        Perform measurement update and track management.
+
+        Args:
+            detections (list): List[ppdet.modeling.mot.utils.Detection]
+                A list of detections at the current time step.
+
+        Returns:
+            list: The tracks that are still alive after this step (the
+                tracker's own `tracks` list).
+        """
+        # Run matching cascade.
+        matches, unmatched_tracks, unmatched_detections = \
+            self._match(detections)
+
+        # Update track set.
+        for track_idx, detection_idx in matches:
+            self.tracks[track_idx].update(self.motion,
+                                          detections[detection_idx])
+        for track_idx in unmatched_tracks:
+            self.tracks[track_idx].mark_missed()
+        for detection_idx in unmatched_detections:
+            self._initiate_track(detections[detection_idx])
+        self.tracks = [t for t in self.tracks if not t.is_deleted()]
+
+        # Update distance metric with the features cached on confirmed tracks;
+        # each track's cache is emptied once its features are handed over.
+        active_targets = [t.track_id for t in self.tracks if t.is_confirmed()]
+        features, targets = [], []
+        for track in self.tracks:
+            if not track.is_confirmed():
+                continue
+            features += track.features
+            targets += [track.track_id for _ in track.features]
+            track.features = []
+        self.metric.partial_fit(
+            np.asarray(features), np.asarray(targets), active_targets)
+        return self.tracks
+
+    def _match(self, detections):
+        """
+        Associate current tracks with `detections`.
+
+        Confirmed tracks are matched first through `matching_cascade` with the
+        gated appearance metric; unconfirmed tracks and confirmed tracks missed
+        for exactly one step are then matched through `min_cost_matching` with
+        `iou_cost`.
+
+        Returns:
+            tuple: `(matches, unmatched_tracks, unmatched_detections)`.
+        """
+
+        def gated_metric(tracks, dets, track_indices, detection_indices):
+            # Appearance cost, gated by the motion model.
+            features = np.array([dets[i].feature for i in detection_indices])
+            targets = np.array([tracks[i].track_id for i in track_indices])
+            cost_matrix = self.metric.distance(features, targets)
+            cost_matrix = gate_cost_matrix(self.motion, cost_matrix, tracks,
+                                           dets, track_indices,
+                                           detection_indices)
+            return cost_matrix
+
+        # Split track set into confirmed and unconfirmed tracks.
+        confirmed_tracks = [
+            i for i, t in enumerate(self.tracks) if t.is_confirmed()
+        ]
+        unconfirmed_tracks = [
+            i for i, t in enumerate(self.tracks) if not t.is_confirmed()
+        ]
+
+        # Associate confirmed tracks using appearance features.
+        matches_a, unmatched_tracks_a, unmatched_detections = \
+            matching_cascade(
+                gated_metric, self.metric.matching_threshold, self.max_age,
+                self.tracks, detections, confirmed_tracks)
+
+        # Associate remaining tracks together with unconfirmed tracks using IOU.
+        iou_track_candidates = unconfirmed_tracks + [
+            k for k in unmatched_tracks_a
+            if self.tracks[k].time_since_update == 1
+        ]
+        unmatched_tracks_a = [
+            k for k in unmatched_tracks_a
+            if self.tracks[k].time_since_update != 1
+        ]
+        matches_b, unmatched_tracks_b, unmatched_detections = \
+            min_cost_matching(
+                iou_cost, self.max_iou_distance, self.tracks,
+                detections, iou_track_candidates, unmatched_detections)
+
+        matches = matches_a + matches_b
+        unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b))
+        return matches, unmatched_tracks, unmatched_detections
+
+    def _initiate_track(self, detection):
+        """Create a new track from `detection` and assign it the next free id."""
+        mean, covariance = self.motion.initiate(detection.to_xyah())
+        self.tracks.append(
+            Track(mean, covariance, self._next_id, self.n_init, self.max_age,
+                  detection.feature))
+        self._next_id += 1
--- a/ppdet/modeling/mot/tracker/jde_tracker.py
+++ b/ppdet/modeling/mot/tracker/jde_tracker.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is borrowed from https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py
+"""
+
+import paddle
+
+from ..matching import jde_matching as matching
+from .base_jde_tracker import TrackState, BaseTrack, STrack
+from .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks
+
+from ppdet.core.workspace import register, serializable
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['JDETracker']
+
+
+@register
+@serializable
+class JDETracker(object):
+    __inject__ = ['motion']
+    """
+    JDE tracker
+
+    Args:
+        det_thresh (float): threshold of detection score
+        track_buffer (int): buffer for tracker
+        min_box_area (int): min box area to filter out low quality boxes
+        tracked_thresh (float): linear assignment threshold of tracked 
+            stracks and detections
+        r_tracked_thresh (float): linear assignment threshold of 
+            tracked stracks and unmatched detections
+        unconfirmed_thresh (float): linear assignment threshold of 
+            unconfirmed stracks and unmatched detections
+        motion (object): KalmanFilter instance
+    """
+
+    def __init__(self,
+                 det_thresh=0.3,
+                 track_buffer=30,
+                 min_box_area=200,
+                 tracked_thresh=0.7,
+                 r_tracked_thresh=0.5,
+                 unconfirmed_thresh=0.7,
+                 motion='KalmanFilter'):
+        self.det_thresh = det_thresh
+        self.track_buffer = track_buffer
+        self.min_box_area = min_box_area
+        self.tracked_thresh = tracked_thresh
+        self.r_tracked_thresh = r_tracked_thresh
+        self.unconfirmed_thresh = unconfirmed_thresh
+        self.motion = motion
+
+        self.frame_id = 0
+        self.tracked_stracks = []
+        self.lost_stracks = []
+        self.removed_stracks = []
+
+        self.max_time_lost = 0
+        # max_time_lost will be calculated: int(frame_rate / 30.0 * track_buffer)
+
+    def update(self, pred_dets, pred_embs):
+        """
+        Processes the image frame and finds bounding box(detections).
+        Associates the detection with corresponding tracklets and also handles
+            lost, removed, refound and active tracklets.
+
+        Args:
+            pred_dets (Tensor): Detection results of the image, shape is [N, 5].
+            pred_embs (Tensor): Embedding results of the image, shape is [N, 512].
+
+        Return:
+            output_stracks (list): The list contains information regarding the
+                online_tracklets for the recieved image tensor.
+        """
+        self.frame_id += 1
+        activated_starcks = []
+        # for storing active tracks, for the current frame
+        refind_stracks = []
+        # Lost Tracks whose detections are obtained in the current frame
+        lost_stracks = []
+        # The tracks which are not obtained in the current frame but are not 
+        # removed. (Lost for some time lesser than the threshold for removing)
+        removed_stracks = []
+
+        # Filter out the image with box_num = 0. pred_dets = [[0.0, 0.0, 0.0 ,0.0]]
+        empty_pred = True if len(pred_dets) == 1 and paddle.sum(
+            pred_dets) == 0.0 else False
+        """ Step 1: Network forward, get detections & embeddings"""
+        if len(pred_dets) > 0 and not empty_pred:
+            pred_dets = pred_dets.numpy()
+            pred_embs = pred_embs.numpy()
+            detections = [
+                STrack(STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f, 30)
+                for (tlbrs, f) in zip(pred_dets, pred_embs)
+            ]
+        else:
+            detections = []
+        ''' Add newly detected tracklets to tracked_stracks'''
+        unconfirmed = []
+        tracked_stracks = []  # type: list[STrack]
+        for track in self.tracked_stracks:
+            if not track.is_activated:
+                # previous tracks which are not active in the current frame are added in unconfirmed list
+                unconfirmed.append(track)
+            else:
+                # Active tracks are added to the local list 'tracked_stracks'
+                tracked_stracks.append(track)
+        """ Step 2: First association, with embedding"""
+        # Combining currently tracked_stracks and lost_stracks
+        strack_pool = joint_stracks(tracked_stracks, self.lost_stracks)
+        # Predict the current location with KF
+        STrack.multi_predict(strack_pool, self.motion)
+
+        dists = matching.embedding_distance(strack_pool, detections)
+        dists = matching.fuse_motion(self.motion, dists, strack_pool,
+                                     detections)
+        # The dists is the list of distances of the detection with the tracks in strack_pool
+        matches, u_track, u_detection = matching.linear_assignment(
+            dists, thresh=self.tracked_thresh)
+        # The matches is the array for corresponding matches of the detection with the corresponding strack_pool
+
+        for itracked, idet in matches:
+            # itracked is the id of the track and idet is the detection
+            track = strack_pool[itracked]
+            det = detections[idet]
+            if track.state == TrackState.Tracked:
+                # If the track is active, add the detection to the track
+                track.update(detections[idet], self.frame_id)
+                activated_starcks.append(track)
+            else:
+                # We have obtained a detection from a track which is not active,
+                # hence put the track in refind_stracks list
+                track.re_activate(det, self.frame_id, new_id=False)
+                refind_stracks.append(track)
+
+        # None of the steps below happen if there are no undetected tracks.
+        """ Step 3: Second association, with IOU"""
+        detections = [detections[i] for i in u_detection]
+        # detections is now a list of the unmatched detections
+        r_tracked_stracks = []
+        # This is container for stracks which were tracked till the previous
+        # frame but no detection was found for it in the current frame.
+
+        for i in u_track:
+            if strack_pool[i].state == TrackState.Tracked:
+                r_tracked_stracks.append(strack_pool[i])
+        dists = matching.iou_distance(r_tracked_stracks, detections)
+        matches, u_track, u_detection = matching.linear_assignment(
+            dists, thresh=self.r_tracked_thresh)
+        # matches is the list of detections which matched with corresponding
+        # tracks by IOU distance method.
+
+        for itracked, idet in matches:
+            track = r_tracked_stracks[itracked]
+            det = detections[idet]
+            if track.state == TrackState.Tracked:
+                track.update(det, self.frame_id)
+                activated_starcks.append(track)
+            else:
+                track.re_activate(det, self.frame_id, new_id=False)
+                refind_stracks.append(track)
+        # Same process done for some unmatched detections, but now considering IOU_distance as measure
+
+        for it in u_track:
+            track = r_tracked_stracks[it]
+            if not track.state == TrackState.Lost:
+                track.mark_lost()
+                lost_stracks.append(track)
+        # If no detections are obtained for tracks (u_track), the tracks are added to lost_tracks list and are marked lost
+        '''Deal with unconfirmed tracks, usually tracks with only one beginning frame'''
+        detections = [detections[i] for i in u_detection]
+        dists = matching.iou_distance(unconfirmed, detections)
+        matches, u_unconfirmed, u_detection = matching.linear_assignment(
+            dists, thresh=self.unconfirmed_thresh)
+        for itracked, idet in matches:
+            unconfirmed[itracked].update(detections[idet], self.frame_id)
+            activated_starcks.append(unconfirmed[itracked])
+
+        # The tracks which are yet not matched
+        for it in u_unconfirmed:
+            track = unconfirmed[it]
+            track.mark_removed()
+            removed_stracks.append(track)
+
+        # after all these confirmation steps, if a new detection is found, it is initialized for a new track
+        """ Step 4: Init new stracks"""
+        for inew in u_detection:
+            track = detections[inew]
+            if track.score < self.det_thresh:
+                continue
+            track.activate(self.motion, self.frame_id)
+            activated_starcks.append(track)
+        """ Step 5: Update state"""
+        # If the tracks are lost for more frames than the threshold number, the tracks are removed.
+        for track in self.lost_stracks:
+            if self.frame_id - track.end_frame > self.max_time_lost:
+                track.mark_removed()
+                removed_stracks.append(track)
+
+        # Update the self.tracked_stracks and self.lost_stracks using the updates in this step.
+        self.tracked_stracks = [
+            t for t in self.tracked_stracks if t.state == TrackState.Tracked
+        ]
+        self.tracked_stracks = joint_stracks(self.tracked_stracks,
+                                             activated_starcks)
+        self.tracked_stracks = joint_stracks(self.tracked_stracks,
+                                             refind_stracks)
+
+        self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks)
+        self.lost_stracks.extend(lost_stracks)
+        self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks)
+        self.removed_stracks.extend(removed_stracks)
+        self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(
+            self.tracked_stracks, self.lost_stracks)
+        # get scores of lost tracks
+        output_stracks = [
+            track for track in self.tracked_stracks if track.is_activated
+        ]
+
+        logger.debug('===========Frame {}=========='.format(self.frame_id))
+        logger.debug('Activated: {}'.format(
+            [track.track_id for track in activated_starcks]))
+        logger.debug('Refind: {}'.format(
+            [track.track_id for track in refind_stracks]))
+        logger.debug('Lost: {}'.format(
+            [track.track_id for track in lost_stracks]))
+        logger.debug('Removed: {}'.format(
+            [track.track_id for track in removed_stracks]))
+
+        return output_stracks
--- a/ppdet/modeling/mot/utils.py
+++ b/ppdet/modeling/mot/utils.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2
+import time
+import paddle
+import numpy as np
+
+__all__ = [
+    'Timer',
+    'Detection',
+    'load_det_results',
+    'preprocess_reid',
+    'get_crops',
+    'clip_box',
+    'scale_coords',
+]
+
+
+class Timer(object):
+    """
+    Small stopwatch used to measure per-call latency (and hence FPS)
+    during evaluation.
+
+    Call `tic()` before the timed section and `toc()` after it; `toc`
+    returns either the running average or the duration of the last call.
+    """
+
+    def __init__(self):
+        # All counters start from the same zeroed state that `clear` restores.
+        self.clear()
+
+    def tic(self):
+        # Wall-clock time (time.time) is used rather than the legacy
+        # time.clock, which is not suitable for multithreaded timing.
+        self.start_time = time.time()
+
+    def toc(self, average=True):
+        """
+        Stop the current measurement.
+
+        Args:
+            average (bool): if True return the mean duration over all
+                calls so far, otherwise the duration of this call only.
+
+        Returns:
+            float: the selected duration in seconds.
+        """
+        elapsed = time.time() - self.start_time
+        self.diff = elapsed
+        self.total_time += elapsed
+        self.calls += 1
+        self.average_time = self.total_time / self.calls
+        self.duration = self.average_time if average else self.diff
+        return self.duration
+
+    def clear(self):
+        """Reset every accumulated statistic to zero."""
+        self.total_time = 0.
+        self.calls = 0
+        self.start_time = 0.
+        self.diff = 0.
+        self.average_time = 0.
+        self.duration = 0.
+
+
+class Detection(object):
+    """
+    A single bounding-box detection in one image.
+
+    Args:
+        tlwh (ndarray): Bounding box in format `(top left x, top left y,
+            width, height)`; stored as float32.
+        confidence (ndarray): Detector confidence score; stored as float32.
+        feature (Tensor): A feature vector describing the object in this
+            box; converted with its `.numpy()` method.
+    """
+
+    def __init__(self, tlwh, confidence, feature):
+        self.tlwh = np.asarray(tlwh, dtype=np.float32)
+        self.confidence = np.asarray(confidence, dtype=np.float32)
+        self.feature = feature.numpy()
+
+    def to_tlbr(self):
+        """
+        Return the box as `(min x, min y, max x, max y)`, i.e.
+        `(top left, bottom right)`. The stored box is not modified.
+        """
+        box = self.tlwh.copy()
+        # bottom-right corner = top-left corner + (width, height)
+        box[2:] = box[2:] + box[:2]
+        return box
+
+    def to_xyah(self):
+        """
+        Return the box as `(center x, center y, aspect ratio, height)`,
+        where the aspect ratio is `width / height`. The stored box is not
+        modified.
+        """
+        box = self.tlwh.copy()
+        # centre = top-left corner + half of (width, height)
+        box[:2] = box[:2] + box[2:] / 2
+        box[2] = box[2] / box[3]
+        return box
+
+
+def load_det_results(det_file, num_frames):
+    """
+    Load per-frame detection results from a comma-separated text file.
+
+    Each row is read as float32; column 0 is the 1-based frame index,
+    columns 2..5 are stored as the box and column 6 as the score.
+
+    Args:
+        det_file (str): path of the detection file.
+        num_frames (int): number of frames to build results for.
+
+    Returns:
+        list[dict]: one `{'bbox': [...], 'score': [...]}` dict per frame,
+            in frame order. Frames without detections get empty lists.
+    """
+    assert os.path.exists(det_file) and os.path.isfile(det_file), \
+        'Error: det_file: {} not exist or not a file.'.format(det_file)
+    # ndmin=2 keeps a file with a single detection row two-dimensional;
+    # without it loadtxt returns a 1-D array and `labels[:, 0]` fails.
+    labels = np.loadtxt(det_file, dtype='float32', delimiter=',', ndmin=2)
+    if labels.size == 0:
+        # Empty file: no detections in any frame.
+        return [{'bbox': [], 'score': []} for _ in range(num_frames)]
+
+    results_list = []
+    for frame_i in range(0, num_frames):
+        results = {'bbox': [], 'score': []}
+        labels_with_frame = labels[labels[:, 0] == frame_i + 1]
+        for l in labels_with_frame:
+            results['bbox'].append(l[2:6])
+            results['score'].append(l[6])
+        results_list.append(results)
+    return results_list
+
+
+def scale_coords(coords, input_shape, im_shape, scale_factor):
+    """
+    Undo the padding and scaling applied to box coordinates.
+
+    `coords` is modified in place and a rounded copy is returned.
+
+    Args:
+        coords (Tensor): box coordinates; even columns are shifted by the
+            horizontal padding and odd columns by the vertical padding
+            (presumably `(x1, y1, x2, y2)` -- TODO confirm with caller).
+        input_shape (sequence): size of the padded network input, read as
+            `[0]` for the vertical and `[1]` for the horizontal extent.
+        im_shape (Tensor): only the first row is read; it is divided by
+            the ratio to recover the original image size.
+        scale_factor (Tensor): only element `[0][0]` is read and used as
+            the single resize ratio.
+
+    Returns:
+        Tensor: `coords` after un-padding, un-scaling and rounding.
+    """
+    im_shape = im_shape.numpy()[0]
+    ratio = scale_factor.numpy()[0][0]
+    # Original image size recovered from the resized size and the ratio.
+    img0_shape = [int(im_shape[0] / ratio), int(im_shape[1] / ratio)]
+
+    # Padding is assumed to be split evenly between both sides (hence / 2).
+    pad_w = (input_shape[1] - img0_shape[1] * ratio) / 2
+    pad_h = (input_shape[0] - img0_shape[0] * ratio) / 2
+    coords[:, 0::2] -= pad_w
+    coords[:, 1::2] -= pad_h
+    coords[:, 0:4] /= paddle.to_tensor(ratio)
+    # The upper bound is the current maximum of the values themselves, so
+    # this clip only enforces the lower bound of 0.
+    coords[:, :4] = paddle.clip(coords[:, :4], min=0, max=coords[:, :4].max())
+    return coords.round()
+
+
+def clip_box(xyxy, input_shape, im_shape, scale_factor):
+    """
+    Clip box coordinates to the bounds of the original image (in place).
+
+    Args:
+        xyxy (Tensor): boxes; even columns are clipped to
+            `[0, img0_shape[1]]` and odd columns to `[0, img0_shape[0]]`.
+        input_shape: not used by this function.
+        im_shape (Tensor): only the first row is read; divided by the
+            ratio to recover the original image size.
+        scale_factor (Tensor): only element `[0][0]` is read as the ratio.
+
+    Returns:
+        Tensor: the same `xyxy` tensor after clipping.
+    """
+    im_shape = im_shape.numpy()[0]
+    ratio = scale_factor.numpy()[0][0]
+    # Original image size recovered from the resized size and the ratio.
+    img0_shape = [int(im_shape[0] / ratio), int(im_shape[1] / ratio)]
+
+    xyxy[:, 0::2] = paddle.clip(xyxy[:, 0::2], min=0, max=img0_shape[1])
+    xyxy[:, 1::2] = paddle.clip(xyxy[:, 1::2], min=0, max=img0_shape[0])
+    return xyxy
+
+
+def get_crops(xyxy, ori_img, pred_scores, w, h):
+    """
+    Cut the detected boxes out of the original image and preprocess the
+    crops with `preprocess_reid`.
+
+    Args:
+        xyxy (Tensor): boxes, truncated to int64 before slicing.
+        ori_img (Tensor): original image with a leading axis of size 1
+            that is squeezed away (presumably `[1, H, W, C]` -- TODO
+            confirm with caller).
+        pred_scores (Tensor): one score per box.
+        w (int): crop width passed to `preprocess_reid`.
+        h (int): crop height passed to `preprocess_reid`.
+
+    Returns:
+        tuple: `(crops, keep_scores)`. `crops` is the batch returned by
+            `preprocess_reid` and `keep_scores` the scores of the boxes
+            that were kept; both are empty lists when no box is valid.
+    """
+    crops = []
+    keep_scores = []
+    xyxy = xyxy.numpy().astype(np.int64)
+    ori_img = ori_img.numpy()
+    # The first two axes are swapped so that bbox[0]:bbox[2] indexes the
+    # first axis and bbox[1]:bbox[3] the second one below.
+    ori_img = np.squeeze(ori_img, axis=0).transpose(1, 0, 2)
+    pred_scores = pred_scores.numpy()
+    for i, bbox in enumerate(xyxy):
+        # Skip degenerate boxes with non-positive extent on either axis.
+        if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]:
+            continue
+        crop = ori_img[bbox[0]:bbox[2], bbox[1]:bbox[3], :]
+        crops.append(crop)
+        keep_scores.append(pred_scores[i])
+    if len(crops) == 0:
+        return [], []
+    crops = preprocess_reid(crops, w, h)
+    return crops, keep_scores
+
+
+def preprocess_reid(imgs,
+                    w=64,
+                    h=192,
+                    mean=[0.485, 0.456, 0.406],
+                    std=[0.229, 0.224, 0.225]):
+    """
+    Resize and normalize image crops into a single batch.
+
+    Every crop is resized with `cv2.resize(img, (w, h))`, its channel
+    order is reversed (presumably BGR -> RGB -- TODO confirm), it is moved
+    to channel-first layout, scaled by 1/255 and normalized per channel
+    with `mean` and `std`.
+
+    Args:
+        imgs (list[ndarray]): 3-channel image crops.
+        w (int): target width.
+        h (int): target height.
+        mean (list[float]): per-channel mean (3 values). Never mutated.
+        std (list[float]): per-channel std (3 values). Never mutated.
+
+    Returns:
+        ndarray: the crops concatenated along a new leading axis.
+    """
+    # Loop-invariant: build the broadcastable statistics once instead of
+    # once per crop.
+    img_mean = np.array(mean).reshape((3, 1, 1))
+    img_std = np.array(std).reshape((3, 1, 1))
+
+    im_batch = []
+    for img in imgs:
+        img = cv2.resize(img, (w, h))
+        img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255
+        img -= img_mean
+        img /= img_std
+        im_batch.append(np.expand_dims(img, axis=0))
+    return np.concatenate(im_batch, 0)
--- a/ppdet/modeling/mot/visualization.py
+++ b/ppdet/modeling/mot/visualization.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cv2
+import numpy as np
+
+
+def tlwhs_to_tlbrs(tlwhs):
+    """
+    Convert boxes from `(x, y, w, h)` rows to `(x1, y1, x2, y2)` rows.
+
+    Args:
+        tlwhs (array-like): boxes with one row per box; may be empty.
+            Nested lists are accepted as well as ndarrays.
+
+    Returns:
+        ndarray: a new array; the input is not modified.
+    """
+    tlbrs = np.copy(tlwhs)
+    if len(tlbrs) == 0:
+        return tlbrs
+    # Read the corner columns from the ndarray copy rather than from the
+    # raw input, so plain Python lists work too. Columns 0 and 1 are never
+    # written, so the result is unchanged for ndarray input.
+    tlbrs[:, 2] += tlbrs[:, 0]
+    tlbrs[:, 3] += tlbrs[:, 1]
+    return tlbrs
+
+
+def get_color(idx):
+    """Map an id to a deterministic 3-component colour tuple."""
+    scaled = idx * 3
+    # Each channel uses a different multiplier so nearby ids get visibly
+    # different colours; values are wrapped with modulo 255.
+    return tuple((factor * scaled) % 255 for factor in (37, 17, 29))
+
+
+def resize_image(image, max_size=800):
+    """
+    Shrink `image` so that its longer side is at most `max_size`.
+
+    Images that already fit are returned unchanged (same object).
+    """
+    longest_side = max(image.shape[:2])
+    if longest_side <= max_size:
+        return image
+    scale = float(max_size) / longest_side
+    return cv2.resize(image, None, fx=scale, fy=scale)
+
+
+def plot_tracking(image,
+                  tlwhs,
+                  obj_ids,
+                  scores=None,
+                  frame_id=0,
+                  fps=0.,
+                  ids2=None):
+    """
+    Draw tracked boxes and their ids on a copy of `image`.
+
+    Args:
+        image (ndarray): frame to draw on; it is copied, not modified.
+        tlwhs (sequence): boxes as `(x, y, w, h)`.
+        obj_ids (sequence): one id per box; the box colour is derived
+            from the absolute value of the id.
+        scores: accepted for interface compatibility; not used.
+        frame_id (int): frame index shown in the header text.
+        fps (float): frame rate shown in the header text.
+        ids2 (sequence, optional): secondary ids appended to the label.
+
+    Returns:
+        ndarray: the annotated copy of `image`.
+    """
+    im = np.ascontiguousarray(np.copy(image))
+
+    # Text size and line width grow with the image width.
+    text_scale = max(1, image.shape[1] / 1600.)
+    text_thickness = 2
+    line_thickness = max(1, int(image.shape[1] / 500.))
+
+    # NOTE: the previous version also allocated an unused
+    # (width x width x 3) "top_view" canvas on every call and computed
+    # unused `radius` / `_line_thickness` values; they were dropped
+    # without changing the drawn output.
+    cv2.putText(
+        im,
+        'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)),
+        (0, int(15 * text_scale)),
+        cv2.FONT_HERSHEY_PLAIN,
+        text_scale, (0, 0, 255),
+        thickness=2)
+
+    for i, tlwh in enumerate(tlwhs):
+        x1, y1, w, h = tlwh
+        intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h)))
+        obj_id = int(obj_ids[i])
+        id_text = '{}'.format(int(obj_id))
+        if ids2 is not None:
+            id_text = id_text + ', {}'.format(int(ids2[i]))
+        color = get_color(abs(obj_id))
+        cv2.rectangle(
+            im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness)
+        cv2.putText(
+            im,
+            id_text, (intbox[0], intbox[1] + 30),
+            cv2.FONT_HERSHEY_PLAIN,
+            text_scale, (0, 0, 255),
+            thickness=text_thickness)
+    return im
+
+
+def plot_trajectory(image, tlwhs, track_ids):
+    """
+    Draw the trajectory of every track on a copy of `image`.
+
+    For each `(x, y, w, h)` box of a track a small dot is drawn at the
+    bottom-centre of the box, coloured by the track id.
+
+    Args:
+        image (ndarray): frame to draw on; it is copied, not modified.
+        tlwhs (sequence): one sequence of boxes per track.
+        track_ids (sequence): one id per track.
+
+    Returns:
+        ndarray: the annotated copy of `image`.
+    """
+    canvas = image.copy()
+    for boxes, tid in zip(tlwhs, track_ids):
+        track_color = get_color(int(tid))
+        for box in boxes:
+            left, top, width, height = (int(v) for v in box)
+            foot_point = (int(left + 0.5 * width), int(top + height))
+            cv2.circle(canvas, foot_point, 2, track_color, thickness=2)
+    return canvas
+
+
+def plot_detections(image, tlbrs, scores=None, color=(255, 0, 0), ids=None):
+    """
+    Draw detection boxes (and optional labels) on a copy of `image`.
+
+    Args:
+        image (ndarray): frame to draw on; it is copied, not modified.
+        tlbrs (sequence): detections; the first four values of each row
+            are the box corners `(x1, y1, x2, y2)`. Rows with at least 7
+            values are labelled from values 5 and 6 when `ids` is given.
+        scores (sequence, optional): one score per detection, drawn at
+            the box's top-left corner when given.
+        color (tuple): box colour.
+        ids (sequence, optional): one integer id per detection.
+
+    Returns:
+        ndarray: the annotated copy of `image`.
+    """
+    im = np.copy(image)
+    text_scale = max(1, image.shape[1] / 800.)
+    thickness = 2 if text_scale > 1.3 else 1
+    for i, det in enumerate(tlbrs):
+        # `np.int` was removed in NumPy 1.24; use a concrete integer dtype
+        # and hand plain Python ints to OpenCV.
+        x1, y1, x2, y2 = np.asarray(det[:4], dtype=np.int64).tolist()
+        # A label was previously also built when `ids` was None but never
+        # drawn; that dead assignment has been removed.
+        if len(det) >= 7 and ids is not None:
+            label = 'det' if det[5] > 0 else 'trk'
+            text = '{}# {:.2f}: {:d}'.format(label, det[6], ids[i])
+            cv2.putText(
+                im,
+                text, (x1, y1 + 30),
+                cv2.FONT_HERSHEY_PLAIN,
+                text_scale, (0, 255, 255),
+                thickness=thickness)
+
+        if scores is not None:
+            text = '{:.2f}'.format(scores[i])
+            cv2.putText(
+                im,
+                text, (x1, y1 + 30),
+                cv2.FONT_HERSHEY_PLAIN,
+                text_scale, (0, 255, 255),
+                thickness=thickness)
+
+        cv2.rectangle(im, (x1, y1), (x2, y2), color, 2)
+    return im
--- a/ppdet/modeling/necks/yolo_fpn.py
+++ b/ppdet/modeling/necks/yolo_fpn.py
@@ -52,7 +52,13 @@ def add_coord(x, data_format):


 class YoloDetBlock(nn.Layer):
-    def __init__(self, ch_in, channel, norm_type, name, data_format='NCHW'):
+    def __init__(self,
+                 ch_in,
+                 channel,
+                 norm_type,
+                 freeze_norm=False,
+                 name='',
+                 data_format='NCHW'):
        """
        YOLODetBlock layer for yolov3, see https://arxiv.org/abs/1804.02767

@@ -60,6 +66,7 @@ class YoloDetBlock(nn.Layer):
            ch_in (int): input channel
            channel (int): base channel
            norm_type (str): batch norm type
+            freeze_norm (bool): whether to freeze norm, default False
            name (str): layer name
            data_format (str): data format, NCHW or NHWC
        """
@@ -87,6 +94,7 @@ class YoloDetBlock(nn.Layer):
                    filter_size=filter_size,
                    padding=(filter_size - 1) // 2,
                    norm_type=norm_type,
+                    freeze_norm=freeze_norm,
                    data_format=data_format,
                    name=name + post_name))

@@ -96,6 +104,7 @@ class YoloDetBlock(nn.Layer):
            filter_size=3,
            padding=1,
            norm_type=norm_type,
+            freeze_norm=freeze_norm,
            data_format=data_format,
            name=name + '.tip')

@@ -112,7 +121,8 @@ class SPP(nn.Layer):
                 k,
                 pool_size,
                 norm_type,
-                 name,
+                 freeze_norm=False,
+                 name='',
                 act='leaky',
                 data_format='NCHW'):
        """
@@ -123,7 +133,9 @@ class SPP(nn.Layer):
            ch_out (int): output channel of conv layer
            k (int): kernel size of conv layer
            norm_type (str): batch norm type
+            freeze_norm (bool): whether to freeze norm, default False
            name (str): layer name
+            act (str): activation function
            data_format (str): data format, NCHW or NHWC
        """
        super(SPP, self).__init__()
@@ -145,6 +157,7 @@ class SPP(nn.Layer):
            k,
            padding=k // 2,
            norm_type=norm_type,
+            freeze_norm=freeze_norm,
            name=name,
            act=act,
            data_format=data_format)
@@ -210,7 +223,8 @@ class CoordConv(nn.Layer):
                 filter_size,
                 padding,
                 norm_type,
-                 name,
+                 freeze_norm=False,
+                 name='',
                 data_format='NCHW'):
        """
        CoordConv layer
@@ -232,6 +246,7 @@ class CoordConv(nn.Layer):
            filter_size=filter_size,
            padding=padding,
            norm_type=norm_type,
+            freeze_norm=freeze_norm,
            data_format=data_format,
            name=name)
        self.data_format = data_format
@@ -419,6 +434,7 @@ class YOLOv3FPN(nn.Layer):
    def __init__(self,
                 in_channels=[256, 512, 1024],
                 norm_type='bn',
+                 freeze_norm=False,
                 data_format='NCHW'):
        """
        YOLOv3FPN layer
@@ -449,6 +465,7 @@ class YOLOv3FPN(nn.Layer):
                    in_channel,
                    channel=512 // (2**i),
                    norm_type=norm_type,
+                    freeze_norm=freeze_norm,
                    data_format=data_format,
                    name=name))
            self.yolo_blocks.append(yolo_block)
@@ -466,14 +483,17 @@ class YOLOv3FPN(nn.Layer):
                        stride=1,
                        padding=0,
                        norm_type=norm_type,
+                        freeze_norm=freeze_norm,
                        data_format=data_format,
                        name=name))
                self.routes.append(route)

-    def forward(self, blocks):
+    def forward(self, blocks, for_mot=False):
        assert len(blocks) == self.num_blocks
        blocks = blocks[::-1]
        yolo_feats = []
+        if for_mot:
+            emb_feats = []
        for i, block in enumerate(blocks):
            if i > 0:
                if self.data_format == 'NCHW':
@@ -483,12 +503,19 @@ class YOLOv3FPN(nn.Layer):
            route, tip = self.yolo_blocks[i](block)
            yolo_feats.append(tip)

+            if for_mot:
+                # add emb_feats output
+                emb_feats.append(route)
+
            if i < self.num_blocks - 1:
                route = self.routes[i](route)
                route = F.interpolate(
                    route, scale_factor=2., data_format=self.data_format)

-        return yolo_feats
+        if for_mot:
+            return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats}
+        else:
+            return yolo_feats

    @classmethod
    def from_config(cls, cfg, input_shape):
@@ -507,6 +534,7 @@ class PPYOLOFPN(nn.Layer):
    def __init__(self,
                 in_channels=[512, 1024, 2048],
                 norm_type='bn',
+                 freeze_norm=False,
                 data_format='NCHW',
                 coord_conv=False,
                 conv_block_num=2,
@@ -568,22 +596,26 @@ class PPYOLOFPN(nn.Layer):
                    [
                        'conv{}'.format(2 * j), ConvLayer, [c_in, c_out, 1],
                        dict(
-                            padding=0, norm_type=norm_type)
+                            padding=0,
+                            norm_type=norm_type,
+                            freeze_norm=freeze_norm)
                    ],
                    [
                        'conv{}'.format(2 * j + 1), ConvBNLayer,
                        [c_out, c_out * 2, 3], dict(
-                            padding=1, norm_type=norm_type)
+                            padding=1,
+                            norm_type=norm_type,
+                            freeze_norm=freeze_norm)
                    ],
                ]
                c_in, c_out = c_out * 2, c_out

            base_cfg += [[
                'route', ConvLayer, [c_in, c_out, 1], dict(
-                    padding=0, norm_type=norm_type)
+                    padding=0, norm_type=norm_type, freeze_norm=freeze_norm)
            ], [
                'tip', ConvLayer, [c_out, c_out * 2, 3], dict(
-                    padding=1, norm_type=norm_type)
+                    padding=1, norm_type=norm_type, freeze_norm=freeze_norm)
            ]]

            if self.conv_block_num == 2:
@@ -591,7 +623,9 @@ class PPYOLOFPN(nn.Layer):
                    if self.spp:
                        spp_cfg = [[
                            'spp', SPP, [channel * 4, channel, 1], dict(
-                                pool_size=[5, 9, 13], norm_type=norm_type)
+                                pool_size=[5, 9, 13],
+                                norm_type=norm_type,
+                                freeze_norm=freeze_norm)
                        ]]
                    else:
                        spp_cfg = []
@@ -603,7 +637,9 @@ class PPYOLOFPN(nn.Layer):
                if self.spp and i == 0:
                    spp_cfg = [[
                        'spp', SPP, [c_in * 4, c_in, 1], dict(
-                            pool_size=[5, 9, 13], norm_type=norm_type)
+                            pool_size=[5, 9, 13],
+                            norm_type=norm_type,
+                            freeze_norm=freeze_norm)
                    ]]
                else:
                    spp_cfg = []
@@ -623,14 +659,17 @@ class PPYOLOFPN(nn.Layer):
                        stride=1,
                        padding=0,
                        norm_type=norm_type,
+                        freeze_norm=freeze_norm,
                        data_format=data_format,
                        name=name))
                self.routes.append(route)

-    def forward(self, blocks):
+    def forward(self, blocks, for_mot=False):
        assert len(blocks) == self.num_blocks
        blocks = blocks[::-1]
        yolo_feats = []
+        if for_mot:
+            emb_feats = []
        for i, block in enumerate(blocks):
            if i > 0:
                if self.data_format == 'NCHW':
@@ -640,12 +679,19 @@ class PPYOLOFPN(nn.Layer):
            route, tip = self.yolo_blocks[i](block)
            yolo_feats.append(tip)

+            if for_mot:
+                # add emb_feats output
+                emb_feats.append(route)
+
            if i < self.num_blocks - 1:
                route = self.routes[i](route)
                route = F.interpolate(
                    route, scale_factor=2., data_format=self.data_format)

-        return yolo_feats
+        if for_mot:
+            return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats}
+        else:
+            return yolo_feats

    @classmethod
    def from_config(cls, cfg, input_shape):

--- a/ppdet/modeling/ops.py
+++ b/ppdet/modeling/ops.py
@@ -52,6 +52,7 @@ def mish(x):
 def batch_norm(ch,
               norm_type='bn',
               norm_decay=0.,
+               freeze_norm=False,
               initializer=None,
               data_format='NCHW'):
    if norm_type == 'sync_bn':
@@ -59,13 +60,30 @@ def batch_norm(ch,
    else:
        batch_norm = nn.BatchNorm2D

-    return batch_norm(
+    norm_lr = 0. if freeze_norm else 1.
+    weight_attr = ParamAttr(
+        initializer=initializer,
+        learning_rate=norm_lr,
+        regularizer=L2Decay(norm_decay),
+        trainable=False if freeze_norm else True)
+    bias_attr = ParamAttr(
+        learning_rate=norm_lr,
+        regularizer=L2Decay(norm_decay),
+        trainable=False if freeze_norm else True)
+
+    norm_layer = batch_norm(
        ch,
-        weight_attr=ParamAttr(
-            initializer=initializer, regularizer=L2Decay(norm_decay)),
-        bias_attr=ParamAttr(regularizer=L2Decay(norm_decay)),
+        weight_attr=weight_attr,
+        bias_attr=bias_attr,
        data_format=data_format)

+    norm_params = norm_layer.parameters()
+    if freeze_norm:
+        for param in norm_params:
+            param.stop_gradient = True
+
+    return norm_layer
+

 @paddle.jit.not_to_static
 def roi_pool(input,

--- a/ppdet/modeling/post_process.py
+++ b/ppdet/modeling/post_process.py
@@ -18,13 +18,18 @@ import paddle.nn as nn
 import paddle.nn.functional as F
 from ppdet.core.workspace import register
 from ppdet.modeling.bbox_utils import nonempty_bbox, rbox2poly, pd_rbox2poly
-from . import ops
 try:
    from collections.abc import Sequence
 except Exception:
    from collections import Sequence

-__all__ = ['BBoxPostProcess', 'MaskPostProcess', 'FCOSPostProcess']
+__all__ = [
+    'BBoxPostProcess',
+    'MaskPostProcess',
+    'FCOSPostProcess',
+    'S2ANetBBoxPostProcess',
+    'JDEBBoxPostProcess',
+]


 @register
@@ -307,3 +312,33 @@ class S2ANetBBoxPostProcess(object):
        pred_bbox = paddle.stack([x1, y1, x2, y2, x3, y3, x4, y4], axis=-1)
        pred_result = paddle.concat([pred_label_score, pred_bbox], axis=1)
        return pred_result
+
+
+@register
+class JDEBBoxPostProcess(BBoxPostProcess):
+    # Post-process for the JDE model. `decode`, `nms` and `num_classes`
+    # are used from `self`; they are presumably set up by the parent
+    # BBoxPostProcess -- NOTE(review): confirm against the base class.
+    def __call__(self, head_out, anchors):
+        """
+        Decode the bbox and do NMS for JDE model.
+
+        Args:
+            head_out (list): Bbox_pred and cls_prob of bbox_head output.
+            anchors (list): Anchors of JDE model.
+
+        Returns:
+            boxes_idx (Tensor): The index of kept bboxes after decode 'JDEBox'.
+            bbox_pred (Tensor): The output is the prediction with shape [N, 6]
+                including labels, scores and bboxes.
+            bbox_num (Tensor): The number of prediction of each batch with shape [N].
+            nms_keep_idx (Tensor): The index of kept bboxes after NMS.
+        """
+        boxes_idx, bboxes, score = self.decode(head_out, anchors)
+        bbox_pred, bbox_num, nms_keep_idx = self.nms(bboxes, score,
+                                                     self.num_classes)
+        # When NMS keeps nothing, return a single placeholder row (label -1,
+        # all other values 0) so the outputs are never empty.
+        if bbox_pred.shape[0] == 0:
+            bbox_pred = paddle.to_tensor(
+                np.array(
+                    [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32'))
+            bbox_num = paddle.to_tensor(np.array([1], dtype='int32'))
+            nms_keep_idx = paddle.to_tensor(np.array([[0]], dtype='int32'))
+
+        return boxes_idx, bbox_pred, bbox_num, nms_keep_idx
--- a/ppdet/modeling/reid/__init__.py
+++ b/ppdet/modeling/reid/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from . import jde_embedding_head
+from . import pyramidal_embedding
+from . import resnet
+
+from .jde_embedding_head import *
+from .pyramidal_embedding import *
+from .resnet import *
--- a/ppdet/modeling/reid/jde_embedding_head.py
+++ b/ppdet/modeling/reid/jde_embedding_head.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from ppdet.core.workspace import register
+from paddle.nn.initializer import Normal, Constant
+
+__all__ = ['JDEEmbeddingHead']
+
+
+class LossParam(nn.Layer):
+    """
+    Learnable scalar used to weight a loss term.
+
+    With `s = loss_param`, `forward` returns
+    `0.5 * (exp(-s) * inputs + s)`.
+
+    Args:
+        init_value (float): initial value of the learnable scalar.
+        use_uncertainy (bool): accepted but not used by this class.
+    """
+
+    def __init__(self, init_value=0., use_uncertainy=True):
+        super(LossParam, self).__init__()
+        # Single float32 parameter of shape [1], initialised to init_value.
+        self.loss_param = self.create_parameter(
+            shape=[1],
+            attr=ParamAttr(initializer=Constant(value=init_value)),
+            dtype="float32")
+
+    def forward(self, inputs):
+        # exp(-s) scales the incoming loss; adding s penalises large s.
+        out = paddle.exp(-self.loss_param) * inputs + self.loss_param
+        return out * 0.5
+
+
+@register
+class JDEEmbeddingHead(nn.Layer):
+    __shared__ = ['num_classes']
+    __inject__ = ['emb_loss', 'jde_loss']
+    """
+    JDEEmbeddingHead
+    Args:
+        num_classes(int): Number of classes. Only support one class tracking.
+        num_identifiers(int): Number of identifiers.
+        anchor_levels(int): Number of anchor levels, same as FPN levels.
+        anchor_scales(int): Number of anchor scales on each FPN level.
+        embedding_dim(int): Embedding dimension. Default: 512.
+        emb_loss(object): Instance of 'JDEEmbeddingLoss'
+        jde_loss(object): Instance of 'JDELoss'
+    """
+
+    def __init__(
+            self,
+            num_classes=1,
+            num_identifiers=1,  # defined by dataset.total_identities
+            anchor_levels=3,
+            anchor_scales=4,
+            embedding_dim=512,
+            emb_loss='JDEEmbeddingLoss',
+            jde_loss='JDELoss'):
+        # Builds, for every anchor level, one 3x3 embedding conv and three
+        # learnable loss weights (cls / reg / ide), plus a shared linear
+        # identity classifier.
+        super(JDEEmbeddingHead, self).__init__()
+        self.num_classes = num_classes
+        self.num_identifiers = num_identifiers
+        self.anchor_levels = anchor_levels
+        self.anchor_scales = anchor_scales
+        self.embedding_dim = embedding_dim
+        self.emb_loss = emb_loss
+        self.jde_loss = jde_loss
+
+        # Embedding scale: sqrt(2) * log(num_identifiers - 1), falling back
+        # to 1 when there is at most one identity (log would be undefined).
+        self.emb_scale = math.sqrt(2) * math.log(
+            self.num_identifiers - 1) if self.num_identifiers > 1 else 1
+
+        self.identify_outputs = []
+        self.loss_params_cls = []
+        self.loss_params_reg = []
+        self.loss_params_ide = []
+        for i in range(self.anchor_levels):
+            name = 'identify_output.{}'.format(i)
+            # Input channels halve with every level:
+            # 64 * 2**anchor_levels for level 0, then /2 per level.
+            identify_output = self.add_sublayer(
+                name,
+                nn.Conv2D(
+                    in_channels=64 * (2**self.anchor_levels) // (2**i),
+                    out_channels=self.embedding_dim,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    weight_attr=ParamAttr(name=name + '.conv.weights'),
+                    bias_attr=ParamAttr(
+                        name=name + '.conv.bias', regularizer=L2Decay(0.))))
+            self.identify_outputs.append(identify_output)
+
+            # Per-level learnable loss weights with fixed initial values.
+            loss_p_cls = self.add_sublayer('cls.{}'.format(i), LossParam(-4.15))
+            self.loss_params_cls.append(loss_p_cls)
+            loss_p_reg = self.add_sublayer('reg.{}'.format(i), LossParam(-4.85))
+            self.loss_params_reg.append(loss_p_reg)
+            loss_p_ide = self.add_sublayer('ide.{}'.format(i), LossParam(-2.3))
+            self.loss_params_ide.append(loss_p_ide)
+
+        # Linear layer mapping an embedding to one logit per identity.
+        self.classifier = self.add_sublayer(
+            'classifier',
+            nn.Linear(
+                self.embedding_dim,
+                self.num_identifiers,
+                weight_attr=ParamAttr(
+                    learning_rate=1., initializer=Normal(
+                        mean=0.0, std=0.01)),
+                bias_attr=ParamAttr(
+                    learning_rate=2., regularizer=L2Decay(0.))))
+
+    def forward(self,
+                identify_feats,
+                targets=None,
+                loss_confs=None,
+                loss_boxes=None,
+                test_emb=False):
+        """
+        Run the embedding head.
+
+        Args:
+            identify_feats (list): one feature map per anchor level.
+            targets (dict, optional): ground-truth targets; required in
+                training mode and when `test_emb` is True.
+            loss_confs (list, optional): per-level confidence losses,
+                required in training mode.
+            loss_boxes (list, optional): per-level box losses, required in
+                training mode.
+            test_emb (bool): in eval mode, return embeddings together with
+                their ground-truth ids instead of plain embeddings.
+
+        Returns:
+            In training mode the result of `self.jde_loss(...)`; in eval
+            mode the result of `get_emb_and_gt_outs` (if `test_emb`) or of
+            `get_emb_outs`.
+        """
+        assert len(identify_feats) == self.anchor_levels
+        ide_outs = []
+        for feat, ide_head in zip(identify_feats, self.identify_outputs):
+            ide_outs.append(ide_head(feat))
+
+        if self.training:
+            # Identity check: `!= None` would dispatch to the operand's own
+            # comparison operator instead of testing for absence.
+            assert targets is not None
+            assert len(loss_confs) == len(loss_boxes) == self.anchor_levels
+            loss_ides = self.emb_loss(ide_outs, targets, self.emb_scale,
+                                      self.classifier)
+            return self.jde_loss(loss_confs, loss_boxes, loss_ides,
+                                 self.loss_params_cls, self.loss_params_reg,
+                                 self.loss_params_ide, targets)
+        else:
+            if test_emb:
+                assert targets is not None
+                embs_and_gts = self.get_emb_and_gt_outs(ide_outs, targets)
+                return embs_and_gts
+            else:
+                emb_outs = self.get_emb_outs(ide_outs)
+                return emb_outs
+
+    def get_emb_and_gt_outs(self, ide_outs, targets):
+        """Gather scaled, normalized embeddings together with their target ids.
+
+        For every level ``i`` the entries ``targets['tconf{i}']`` and
+        ``targets['tide{i}']`` are read. Locations whose confidence target is
+        positive (after a max over axis 1) are selected, their embeddings are
+        normalized and multiplied by ``self.emb_scale``, and the matching id
+        is appended as one extra column.
+
+        Returns:
+            All selected rows concatenated along axis 0, or a zero tensor of
+            shape ``(1, self.embedding_dim + 1)`` when nothing was selected.
+        """
+        collected = []
+        for level, level_out in enumerate(ide_outs):
+            conf_target = targets['tconf{}'.format(level)]
+            id_target = targets['tide{}'.format(level)]
+
+            # Move axis 1 to the end so each row holds one embedding vector.
+            flat_emb = paddle.reshape(
+                level_out.transpose((0, 2, 3, 1)), [-1, self.embedding_dim])
+
+            positive = paddle.cast(conf_target > 0, dtype="int64")
+            keep_inds = paddle.nonzero(
+                positive.max(1).flatten() > 0).flatten()
+            if len(keep_inds) == 0:
+                continue
+
+            flat_ids = paddle.reshape(id_target.max(1), [-1, 1])
+            kept_ids = paddle.gather(flat_ids, keep_inds)
+            kept_emb = self.emb_scale * F.normalize(
+                paddle.gather(flat_emb, keep_inds))
+            collected.append(paddle.concat([kept_emb, kept_ids], axis=1))
+
+        if not collected:
+            return paddle.zeros((1, self.embedding_dim + 1))
+        return paddle.concat(collected, axis=0)
+
+    def get_emb_outs(self, ide_outs):
+        """Return normalized embeddings of every level as one 2-D tensor.
+
+        Each level output has axis 1 moved to the end, is tiled
+        ``self.anchor_scales`` times, normalized along the last axis and
+        reshaped to rows of ``self.embedding_dim`` values.
+
+        Returns:
+            The per-level rows concatenated along axis 0, or a zero tensor of
+            shape ``(1, self.embedding_dim)`` when ``ide_outs`` is empty.
+        """
+        per_level = []
+        for level_out in ide_outs:
+            moved = level_out.transpose((0, 2, 3, 1))
+            tiled = paddle.tile(
+                moved.unsqueeze(axis=0), [1, self.anchor_scales, 1, 1, 1])
+            unit = F.normalize(tiled, axis=-1)
+            per_level.append(paddle.reshape(unit, [-1, self.embedding_dim]))
+
+        if not per_level:
+            return paddle.zeros((1, self.embedding_dim))
+        return paddle.concat(per_level, axis=0)
--- a/ppdet/modeling/reid/pyramidal_embedding.py
+++ b/ppdet/modeling/reid/pyramidal_embedding.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.initializer import Normal, Constant
+from paddle import ParamAttr
+from .resnet import *
+from ppdet.core.workspace import register
+
+__all__ = ['PCBPlusDropoutPyramid']
+
+
+@register
+class PCBPlusDropoutPyramid(nn.Layer):
+    """Pyramidal part-based embedding network on a ResNet101 backbone.
+
+    The backbone feature map is cut along axis 2 into ``num_stripes`` basic
+    stripes. Level ``l`` (0-based, fine to coarse) groups ``l + 1``
+    consecutive stripes with a sliding step of one stripe, so it owns
+    ``num_stripes - l`` branches. Every enabled branch has its own
+    1x1 conv + batch norm + ReLU block and its own linear classifier.
+
+    Args:
+        input_ch (int): input channels of each branch's 1x1 convolution.
+        num_stripes (int): number of basic stripes (sub-parts).
+        used_levels (tuple): one flag per level; a value of 0 disables all
+            branches of that level.
+        num_classes (int): output size of each branch classifier.
+        last_conv_stride (int): forwarded to ``ResNet101``.
+        last_conv_dilation (int): forwarded to ``ResNet101``.
+        num_conv_out_channels (int): output channels of each branch conv.
+    """
+
+    def __init__(
+            self,
+            input_ch=2048,
+            num_stripes=6,  # number of sub-parts
+            used_levels=(1, 1, 1, 1, 1, 1),
+            num_classes=751,
+            last_conv_stride=1,
+            last_conv_dilation=1,
+            num_conv_out_channels=128):
+        super(PCBPlusDropoutPyramid, self).__init__()
+        self.num_stripes = num_stripes
+        self.used_levels = used_levels
+        self.num_classes = num_classes
+
+        # Level 0 has num_stripes branches, the last level has a single one.
+        self.num_in_each_level = list(range(self.num_stripes, 0, -1))
+        self.num_branches = sum(self.num_in_each_level)
+
+        self.base = ResNet101(
+            lr_mult=0.1,
+            last_conv_stride=last_conv_stride,
+            last_conv_dilation=last_conv_dilation)
+        self.dropout_layer = nn.Dropout(p=0.2)
+        self.pyramid_conv_list0, self.pyramid_fc_list0 = self.basic_branch(
+            num_conv_out_channels, input_ch)
+
+    def _used_branches(self):
+        """Yield ``(branch_idx, level_idx, idx_in_level)`` per enabled branch.
+
+        ``branch_idx`` counts all branches (enabled or not) from fine to
+        coarse; branches of levels whose ``used_levels`` flag is 0 are
+        skipped.
+        """
+        first_branch = 0
+        for level_idx, num_in_level in enumerate(self.num_in_each_level):
+            if self.used_levels[level_idx] != 0:
+                for idx_in_level in range(num_in_level):
+                    yield first_branch + idx_in_level, level_idx, idx_in_level
+            first_branch += num_in_level
+
+    def basic_branch(self, num_conv_out_channels, input_ch):
+        """Create the conv blocks and classifiers of all enabled branches.
+
+        Returns:
+            tuple: ``(pyramid_conv_list, pyramid_fc_list)``, two
+            ``nn.LayerList`` objects with one entry per enabled branch.
+        """
+        # the level indexes are defined from fine to coarse,
+        # the branch will contain one more part than that of its previous level
+        # the sliding step is set to 1
+        pyramid_conv_list = nn.LayerList()
+        pyramid_fc_list = nn.LayerList()
+
+        # All conv blocks are created before any classifier, which keeps the
+        # layer creation order of the two-pass construction.
+        for _ in self._used_branches():
+            pyramid_conv_list.append(
+                nn.Sequential(
+                    nn.Conv2D(input_ch, num_conv_out_channels, 1),
+                    nn.BatchNorm2D(num_conv_out_channels), nn.ReLU()))
+
+        for idx_branches, _, _ in self._used_branches():
+            name = "Linear_branch_id_{}".format(idx_branches)
+            fc = nn.Linear(
+                in_features=num_conv_out_channels,
+                out_features=self.num_classes,
+                weight_attr=ParamAttr(
+                    name=name + "_weights",
+                    initializer=Normal(
+                        mean=0., std=0.001)),
+                bias_attr=ParamAttr(
+                    name=name + "_bias", initializer=Constant(value=0.)))
+            pyramid_fc_list.append(fc)
+        return pyramid_conv_list, pyramid_fc_list
+
+    def pyramid_forward(self, feat):
+        """Compute the feature and logits of every enabled branch.
+
+        Args:
+            feat: backbone feature map; axis 2 is split into stripes and the
+                last axis is pooled over completely.
+
+        Returns:
+            tuple: ``(feat_list, logits_list)`` with one flattened feature
+            and one logits tensor per enabled branch.
+        """
+        each_stripe_size = feat.shape[2] // self.num_stripes
+        k = feat.shape[-1]
+
+        feat_list, logits_list = [], []
+        for used_branches, (_, idx_levels, idx_in_each_level) in enumerate(
+                self._used_branches()):
+            stripe_size_in_each_level = each_stripe_size * (idx_levels + 1)
+            start = idx_in_each_level * each_stripe_size
+            end = start + stripe_size_in_each_level
+
+            # Pool the whole stripe down to a single spatial position.
+            stripe = feat[:, :, start:end, :]
+            pool_size = (stripe_size_in_each_level, k)
+            local_feat = F.avg_pool2d(
+                stripe, kernel_size=pool_size) + F.max_pool2d(
+                    stripe, kernel_size=pool_size)
+
+            local_feat = self.pyramid_conv_list0[used_branches](local_feat)
+            local_feat = paddle.reshape(
+                local_feat, shape=[local_feat.shape[0], -1])
+            feat_list.append(local_feat)
+
+            local_logits = self.pyramid_fc_list0[used_branches](
+                self.dropout_layer(local_feat))
+            logits_list.append(local_logits)
+
+        return feat_list, logits_list
+
+    def forward(self, x):
+        """Return the branch features concatenated along the last axis."""
+        feat = self.base(x)
+        assert feat.shape[2] % self.num_stripes == 0
+        feat_list, _ = self.pyramid_forward(feat)
+        return paddle.concat(feat_list, axis=-1)
--- a/ppdet/modeling/reid/resnet.py
+++ b/ppdet/modeling/reid/resnet.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import math
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.initializer import Normal
+
+__all__ = ["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"]
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free 2-D convolution followed by batch normalization.
+
+    Args:
+        num_channels (int): input channels of the convolution.
+        num_filters (int): output channels of the convolution.
+        filter_size (int): square kernel size; padding is
+            ``(filter_size - 1) // 2``.
+        stride (int): convolution stride.
+        dilation (int): convolution dilation.
+        groups (int): convolution groups.
+        act (str|None): activation passed to the batch norm layer.
+        lr_mult (float): learning rate multiplier of the conv weight.
+        name (str|None): base name for the parameters. ``"conv1"`` maps to
+            the batch norm prefix ``"bn_conv1"``; for any other string the
+            first three characters are replaced by ``"bn"``. With ``None``
+            all parameters are left unnamed.
+        data_format (str): data layout passed to both layers.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 dilation=1,
+                 groups=1,
+                 act=None,
+                 lr_mult=1.0,
+                 name=None,
+                 data_format="NCHW"):
+        super(ConvBNLayer, self).__init__()
+
+        def _suffixed(base, suffix):
+            # No base name means the parameter stays unnamed instead of
+            # failing on `None + str`.
+            return None if base is None else base + suffix
+
+        # Weight init: zero-mean normal with std sqrt(2 / (k * k * out)).
+        conv_stdv = filter_size * filter_size * num_filters
+        self._conv = nn.Conv2D(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            dilation=dilation,
+            groups=groups,
+            weight_attr=ParamAttr(
+                name=_suffixed(name, "_weights"),
+                learning_rate=lr_mult,
+                initializer=Normal(0, math.sqrt(2. / conv_stdv))),
+            bias_attr=False,
+            data_format=data_format)
+        if name is None:
+            bn_name = None
+        elif name == "conv1":
+            bn_name = "bn_" + name
+        else:
+            bn_name = "bn" + name[3:]
+        self._batch_norm = nn.BatchNorm(
+            num_filters,
+            act=act,
+            param_attr=ParamAttr(name=_suffixed(bn_name, "_scale")),
+            bias_attr=ParamAttr(name=_suffixed(bn_name, "_offset")),
+            moving_mean_name=_suffixed(bn_name, "_mean"),
+            moving_variance_name=_suffixed(bn_name, "_variance"),
+            data_layout=data_format)
+
+    def forward(self, inputs):
+        """Apply the convolution and then the batch norm to ``inputs``."""
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        return y
+
+
+class BottleneckBlock(nn.Layer):
+    """Residual bottleneck: 1x1 -> 3x3 -> 1x1 ``ConvBNLayer`` stack.
+
+    The block outputs ``num_filters * 4`` channels. When ``shortcut`` is
+    False the skip path is a 1x1 ``ConvBNLayer`` projection (using
+    ``stride``); otherwise the input is added unchanged.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 shortcut=True,
+                 name=None,
+                 lr_mult=1.0,
+                 dilation=1,
+                 data_format="NCHW"):
+        super(BottleneckBlock, self).__init__()
+        expanded_filters = num_filters * 4
+        # Settings shared by every ConvBNLayer of this block.
+        shared = dict(
+            dilation=dilation, lr_mult=lr_mult, data_format=data_format)
+
+        self.conv0 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=1,
+            act="relu",
+            name=name + "_branch2a",
+            **shared)
+        self.conv1 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act="relu",
+            name=name + "_branch2b",
+            **shared)
+        self.conv2 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=expanded_filters,
+            filter_size=1,
+            act=None,
+            name=name + "_branch2c",
+            **shared)
+        if not shortcut:
+            self.short = ConvBNLayer(
+                num_channels=num_channels,
+                num_filters=expanded_filters,
+                filter_size=1,
+                stride=stride,
+                name=name + "_branch1",
+                **shared)
+        self.shortcut = shortcut
+        self._num_channels_out = expanded_filters
+
+    def forward(self, inputs):
+        """Return ``relu(skip(inputs) + conv2(conv1(conv0(inputs))))``."""
+        residual = self.conv2(self.conv1(self.conv0(inputs)))
+        skip = inputs if self.shortcut else self.short(inputs)
+        return F.relu(paddle.add(x=skip, y=residual))
+
+
+class BasicBlock(nn.Layer):
+    """Residual block made of two 3x3 ``ConvBNLayer`` layers.
+
+    When ``shortcut`` is False the skip path is a 1x1 ``ConvBNLayer``
+    projection (using ``stride``); otherwise the input is added unchanged.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 shortcut=True,
+                 name=None,
+                 data_format="NCHW"):
+        super(BasicBlock, self).__init__()
+        self.stride = stride
+        self.shortcut = shortcut
+
+        self.conv0 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act="relu",
+            name=name + "_branch2a",
+            data_format=data_format)
+        self.conv1 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=num_filters,
+            filter_size=3,
+            act=None,
+            name=name + "_branch2b",
+            data_format=data_format)
+        if not shortcut:
+            self.short = ConvBNLayer(
+                num_channels=num_channels,
+                num_filters=num_filters,
+                filter_size=1,
+                stride=stride,
+                name=name + "_branch1",
+                data_format=data_format)
+
+    def forward(self, inputs):
+        """Return ``relu(skip(inputs) + conv1(conv0(inputs)))``."""
+        residual = self.conv1(self.conv0(inputs))
+        skip = inputs if self.shortcut else self.short(inputs)
+        return F.relu(paddle.add(x=skip, y=residual))
+
+
+class ResNet(nn.Layer):
+    """ResNet feature extractor (stem, max pool and four residual stages).
+
+    Args:
+        layers (int): network depth, one of 18, 34, 50, 101 or 152. Depths
+            of 50 and above use ``BottleneckBlock``, the others
+            ``BasicBlock``.
+        lr_mult (float): learning rate multiplier passed to the stem
+            convolution and to every ``BottleneckBlock``.
+        last_conv_stride (int): stride of the first block of the last stage
+            (bottleneck variants only).
+        last_conv_dilation (int): dilation of the blocks of the last stage
+            (bottleneck variants only).
+    """
+
+    def __init__(self,
+                 layers=50,
+                 lr_mult=1.0,
+                 last_conv_stride=2,
+                 last_conv_dilation=1):
+        super(ResNet, self).__init__()
+        self.layers = layers
+        self.data_format = "NCHW"
+        self.input_image_channel = 3
+        supported_layers = [18, 34, 50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(
+                supported_layers, layers)
+        # Number of residual blocks in each of the four stages.
+        if layers == 18:
+            depth = [2, 2, 2, 2]
+        elif layers in (34, 50):
+            depth = [3, 4, 6, 3]
+        elif layers == 101:
+            depth = [3, 4, 23, 3]
+        elif layers == 152:
+            depth = [3, 8, 36, 3]
+
+        use_bottleneck = layers >= 50
+        if use_bottleneck:
+            num_channels = [64, 256, 512, 1024]
+        else:
+            num_channels = [64, 64, 128, 256]
+        num_filters = [64, 128, 256, 512]
+
+        self.conv = ConvBNLayer(
+            num_channels=self.input_image_channel,
+            num_filters=64,
+            filter_size=7,
+            stride=2,
+            act="relu",
+            lr_mult=lr_mult,
+            name="conv1",
+            data_format=self.data_format)
+        self.pool2d_max = nn.MaxPool2D(
+            kernel_size=3, stride=2, padding=1, data_format=self.data_format)
+
+        self.block_list = []
+        last_block = len(depth) - 1
+        for block, num_units in enumerate(depth):
+            stage_name = "res" + str(block + 2)
+            for i in range(num_units):
+                is_first = i == 0
+                if use_bottleneck:
+                    # The third stage of the 101/152 variants is named
+                    # "a", "b1", "b2", ...; all other stages use letters.
+                    if layers in [101, 152] and block == 2:
+                        conv_name = stage_name + (
+                            "a" if is_first else "b" + str(i))
+                    else:
+                        conv_name = stage_name + chr(97 + i)
+
+                    if not is_first or block == 0:
+                        stride = 1
+                    elif block == last_block:
+                        stride = last_conv_stride
+                    else:
+                        stride = 2
+
+                    unit = BottleneckBlock(
+                        num_channels=num_channels[block]
+                        if is_first else num_filters[block] * 4,
+                        num_filters=num_filters[block],
+                        stride=stride,
+                        shortcut=not is_first,
+                        name=conv_name,
+                        lr_mult=lr_mult,
+                        dilation=last_conv_dilation
+                        if block == last_block else 1,
+                        data_format=self.data_format)
+                else:
+                    conv_name = stage_name + chr(97 + i)
+                    unit = BasicBlock(
+                        num_channels=num_channels[block]
+                        if is_first else num_filters[block],
+                        num_filters=num_filters[block],
+                        stride=2 if is_first and block != 0 else 1,
+                        shortcut=not is_first,
+                        name=conv_name,
+                        data_format=self.data_format)
+                # add_sublayer registers the block and returns it.
+                self.block_list.append(self.add_sublayer(conv_name, unit))
+
+    def forward(self, inputs):
+        """Apply the stem, the max pool and every residual block in order."""
+        out = self.pool2d_max(self.conv(inputs))
+        for layer in self.block_list:
+            out = layer(out)
+        return out
+
+
+def ResNet18(**args):
+    """Build an 18-layer ``ResNet``; keyword arguments are forwarded."""
+    return ResNet(layers=18, **args)
+
+
+def ResNet34(**args):
+    """Build a 34-layer ``ResNet``; keyword arguments are forwarded."""
+    return ResNet(layers=34, **args)
+
+
+def ResNet50(pretrained=None, **args):
+    """Build a 50-layer ``ResNet`` and optionally load pretrained weights.
+
+    Args:
+        pretrained (str|None): weights path without the ``.pdparams``
+            suffix; nothing is loaded when it is None.
+        **args: forwarded to ``ResNet``.
+
+    Raises:
+        ValueError: if ``pretrained`` is neither a directory nor the prefix
+            of an existing ``.pdparams`` file.
+    """
+    model = ResNet(layers=50, **args)
+    if pretrained is None:
+        return model
+
+    weights_path = pretrained + '.pdparams'
+    if not os.path.isdir(pretrained) and not os.path.exists(weights_path):
+        raise ValueError("Model pretrain path {} does not "
+                         "exists.".format(pretrained))
+    model.set_dict(paddle.load(weights_path))
+    return model
+
+
+def ResNet101(pretrained=None, **args):
+    """Build a 101-layer ``ResNet`` and optionally load pretrained weights.
+
+    Args:
+        pretrained (str|None): weights path without the ``.pdparams``
+            suffix; nothing is loaded when it is None.
+        **args: forwarded to ``ResNet``.
+
+    Raises:
+        ValueError: if ``pretrained`` is neither a directory nor the prefix
+            of an existing ``.pdparams`` file.
+    """
+    net = ResNet(layers=101, **args)
+    if pretrained is not None:
+        params_file = pretrained + '.pdparams'
+        found = os.path.isdir(pretrained) or os.path.exists(params_file)
+        if not found:
+            raise ValueError("Model pretrain path {} does not "
+                             "exists.".format(pretrained))
+        state = paddle.load(params_file)
+        net.set_dict(state)
+    return net
+
+
+def ResNet152(**args):
+    """Build a 152-layer ``ResNet``; keyword arguments are forwarded."""
+    return ResNet(layers=152, **args)
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,5 @@ Cython
 pycocotools
 #xtcocotools==1.6 #only for crowdpose
 setuptools>=42.0.0
+#lap #for mot
+#motmetrics #for mot
\ No newline at end of file