Unverified commit 822c7933, authored by Feng Ni, committed by GitHub

[cherry-pick][MOT] fix deploy python mot infer (#5630)

* fix deploy python mot infer, fix cfgs

* fix doc, test=document_fix
Parent 20b2f101
......@@ -53,4 +53,4 @@ JDETracker:
det_thresh: 0.3
track_buffer: 30
min_box_area: 200
motion: KalmanFilter
vertical_ratio: 1.6 # for pedestrian
......@@ -11,8 +11,8 @@ JDETracker:
conf_thres: 0.6
low_conf_thres: 0.1
match_thres: 0.9
min_box_area: 100
vertical_ratio: 1.6 # for pedestrian
min_box_area: 0
vertical_ratio: 0 # 1.6 for pedestrian
DeepSORTTracker:
input_size: [64, 192]
......
......@@ -38,7 +38,7 @@ class JDETracker(object):
track_buffer (int): buffer for tracker
min_box_area (int): min box area to filter out low quality boxes
vertical_ratio (float): w/h, the vertical ratio of the bbox to filter
bad results. If set <0, no bboxes are filtered; usually set to
bad results. If set <= 0, no bboxes are filtered; usually set to
1.6 for pedestrian tracking.
tracked_thresh (float): linear assignment threshold of tracked
stracks and detections
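For reference, a minimal sketch of what these two thresholds do per tracked target, mirroring the filter code further down in this diff (with the new defaults of 0, both checks are effectively disabled):

```python
def keep_box(tlwh, min_box_area=0, vertical_ratio=0):
    # tlwh = [x, y, w, h]; mirrors the checks in the tracking() methods below
    if tlwh[2] * tlwh[3] <= min_box_area:
        return False  # area filter; a no-op for positive boxes when min_box_area == 0
    if vertical_ratio > 0 and tlwh[2] / tlwh[3] > vertical_ratio:
        return False  # aspect filter; drops boxes wider than w/h == vertical_ratio
    return True

assert keep_box([0, 0, 50, 120], vertical_ratio=1.6)       # pedestrian-like box is kept
assert not keep_box([0, 0, 240, 120], vertical_ratio=1.6)  # w/h == 2.0 > 1.6, dropped
```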
......@@ -64,8 +64,8 @@ class JDETracker(object):
num_classes=1,
det_thresh=0.3,
track_buffer=30,
min_box_area=200,
vertical_ratio=1.6,
min_box_area=0,
vertical_ratio=0,
tracked_thresh=0.7,
r_tracked_thresh=0.5,
unconfirmed_thresh=0.7,
......@@ -161,9 +161,8 @@ class JDETracker(object):
detections = [
STrack(
STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id,
30, temp_feat)
for (tlbrs, temp_feat
) in zip(pred_dets_cls, pred_embs_cls)
30, temp_feat) for (tlbrs, temp_feat) in
zip(pred_dets_cls, pred_embs_cls)
]
else:
detections = []
......@@ -238,15 +237,13 @@ class JDETracker(object):
for tlbrs in pred_dets_cls_second
]
else:
pred_embs_cls_second = pred_embs_dict[cls_id][inds_second]
pred_embs_cls_second = pred_embs_dict[cls_id][
inds_second]
detections_second = [
STrack(
STrack.tlbr_to_tlwh(tlbrs[2:6]),
tlbrs[1],
cls_id,
30,
temp_feat)
for (tlbrs, temp_feat) in zip(pred_dets_cls_second, pred_embs_cls_second)
STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1],
cls_id, 30, temp_feat) for (tlbrs, temp_feat) in
zip(pred_dets_cls_second, pred_embs_cls_second)
]
else:
detections_second = []
......
......@@ -112,8 +112,8 @@ class JDE_Detector(Detector):
# tracker config
assert self.pred_config.tracker, "The exported JDE Detector model should have a tracker."
cfg = self.pred_config.tracker
min_box_area = cfg.get('min_box_area', 200)
vertical_ratio = cfg.get('vertical_ratio', 1.6)
min_box_area = cfg.get('min_box_area', 0.0)
vertical_ratio = cfg.get('vertical_ratio', 0.0)
conf_thres = cfg.get('conf_thres', 0.0)
tracked_thresh = cfg.get('tracked_thresh', 0.7)
metric_type = cfg.get('metric_type', 'euclidean')
......@@ -164,7 +164,7 @@ class JDE_Detector(Detector):
repeats (int): repeats number for prediction
Returns:
result (dict): includes 'pred_dets': np.ndarray: shape:[N,6], N: number of boxes,
matrix element: [x_min, y_min, x_max, y_max, score, class]
matrix element: [class, score, x_min, y_min, x_max, y_max]
FairMOT(JDE)'s result include 'pred_embs': np.ndarray:
shape: [N, 128]
'''
......
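The corrected column order above is what the downstream code indexes (`tlbrs[1]` for the score, `tlbrs[2:6]` for the box); a minimal sketch with made-up values:

```python
import numpy as np

# One row per box: [class, score, x_min, y_min, x_max, y_max] (illustrative values only).
pred_dets = np.array([[0., 0.91, 10., 20., 110., 220.]])

for det in pred_dets:
    cls_id, score = int(det[0]), float(det[1])  # det[0:2] -> class id and confidence
    x_min, y_min, x_max, y_max = det[2:6]       # det[2:6] -> the tlbr box
    print(cls_id, score, (x_min, y_min, x_max, y_max))
```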
......@@ -165,8 +165,8 @@ class SDE_Detector(Detector):
# use ByteTracker
use_byte = cfg.get('use_byte', False)
det_thresh = cfg.get('det_thresh', 0.3)
min_box_area = cfg.get('min_box_area', 200)
vertical_ratio = cfg.get('vertical_ratio', 1.6)
min_box_area = cfg.get('min_box_area', 0)
vertical_ratio = cfg.get('vertical_ratio', 0)
match_thres = cfg.get('match_thres', 0.9)
conf_thres = cfg.get('conf_thres', 0.6)
low_conf_thres = cfg.get('low_conf_thres', 0.1)
......@@ -194,7 +194,7 @@ class SDE_Detector(Detector):
return result
def reidprocess(self, det_results, repeats=1):
pred_dets = det_results['boxes']
pred_dets = det_results['boxes'] # cls_id, score, x0, y0, x1, y1
pred_xyxys = pred_dets[:, 2:6]
ori_image = det_results['ori_image']
......@@ -234,7 +234,7 @@ class SDE_Detector(Detector):
return det_results
def tracking(self, det_results):
pred_dets = det_results['boxes']
pred_dets = det_results['boxes'] # cls_id, score, x0, y0, x1, y1
pred_embs = det_results.get('embeddings', None)
if self.use_deepsort_tracker:
......
......@@ -11,8 +11,8 @@ JDETracker:
conf_thres: 0.6
low_conf_thres: 0.1
match_thres: 0.9
min_box_area: 100
vertical_ratio: 1.6 # for pedestrian
min_box_area: 0
vertical_ratio: 0 # 1.6 for pedestrian
DeepSORTTracker:
input_size: [64, 192]
......
......@@ -3,27 +3,76 @@
In PaddlePaddle, the inference engine and the training engine use different underlying optimizations. The inference engine uses AnalysisPredictor, a Python interface to the [C++ inference library](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/native_infer.html) that is optimized specifically for inference: it applies multiple graph optimizations to the model and reduces unnecessary memory copies. For users with strict performance requirements when deploying trained models, we provide inference scripts that are independent of PaddleDetection and can be integrated directly.
It mainly involves two steps:
Python-side inference deployment mainly involves two steps:
- Export the inference model
- Run inference with Python
## 1. Export the Inference Model
Training in PaddleDetection involves both the network's forward-pass parameters and optimizer-related parameters, while deployment only needs the forward-pass parameters. For details, see [Export Model](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/deploy/EXPORT_MODEL.md)
Training in PaddleDetection involves both the network's forward-pass parameters and optimizer-related parameters, while deployment only needs the forward-pass parameters. For details, see [Export Model](../deploy/EXPORT_MODEL.md), for example:
```bash
# Export a YOLOv3 detection model
python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --output_dir=./inference_model \
-o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams
# Export a HigherHRNet (bottom-up) keypoint detection model
python tools/export_model.py -c configs/keypoint/higherhrnet/higherhrnet_hrnet_w32_512.yml -o weights=https://paddledet.bj.bcebos.com/models/keypoint/higherhrnet_hrnet_w32_512.pdparams
# Export an HRNet (top-down) keypoint detection model
python tools/export_model.py -c configs/keypoint/hrnet/hrnet_w32_384x288.yml -o weights=https://paddledet.bj.bcebos.com/models/keypoint/hrnet_w32_384x288.pdparams
# Export a FairMOT multi-object tracking model
python tools/export_model.py -c configs/mot/fairmot/fairmot_dla34_30e_1088x608.yml -o weights=https://paddledet.bj.bcebos.com/models/mot/fairmot_dla34_30e_1088x608.pdparams
# Export a ByteTrack multi-object tracking model (equivalent to exporting only the detector)
python tools/export_model.py -c configs/mot/bytetrack/detector/ppyoloe_crn_l_36e_640x640_mot17half.yml -o weights=https://paddledet.bj.bcebos.com/models/mot/ppyoloe_crn_l_36e_640x640_mot17half.pdparams
```
After export, the output directory contains four files: `infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, and `model.pdmodel`.
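As a quick sanity check of the exported files, they can be loaded with the Paddle Inference Python API. A minimal sketch, assuming the YOLOv3 export command above with `--output_dir=./inference_model`:

```python
from paddle.inference import Config, create_predictor

# Path assumed from the export command above; adjust to your own output_dir.
model_dir = './inference_model/yolov3_darknet53_270e_coco'
config = Config(model_dir + '/model.pdmodel', model_dir + '/model.pdiparams')
config.enable_use_gpu(200, 0)  # initial GPU memory pool (MB) and device id; optional
predictor = create_predictor(config)
print('model inputs:', predictor.get_input_names())
```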
## 2. Inference with Python
### 2.1 General Detection
Enter the following command in a terminal to run inference:
```bash
python deploy/python/infer.py --model_dir=./output_inference/yolov3_darknet53_270e_coco --image_file=./demo/000000014439.jpg --device=GPU
```
### 2.2 Keypoint Detection
Enter the following command in a terminal to run inference:
```bash
# Standalone keypoint inference with top-down (HRNet) or bottom-up (HigherHRNet) models; in this mode the top-down HRNet model only supports single-person cropped images
python deploy/python/keypoint_infer.py --model_dir=output_inference/hrnet_w32_384x288/ --image_file=./demo/hrnet_demo.jpg --device=GPU --threshold=0.5
python deploy/python/keypoint_infer.py --model_dir=output_inference/higherhrnet_hrnet_w32_512/ --image_file=./demo/000000014439_640x640.jpg --device=GPU --threshold=0.5
# Joint deployment of a detector plus a top-down keypoint model (joint inference only supports top-down keypoint models)
python deploy/python/det_keypoint_unite_infer.py --det_model_dir=output_inference/yolov3_darknet53_270e_coco/ --keypoint_model_dir=output_inference/hrnet_w32_384x288/ --video_file={your video name}.mp4 --device=GPU
```
**Note:**
- For details on exporting and running keypoint detection models, see [keypoint](../../configs/keypoint/README.md); usage of each model is covered in its own documentation;
- The keypoint deployment in this directory covers only the basic forward pass; for more keypoint features, use the PP-Human project, see [pphuman](../pphuman/README.md)
### 2.3 Multi-Object Tracking
Enter the following command in a terminal to run inference:
```bash
python deploy/python/infer.py --model_dir=./output_inference/yolov3_mobilenet_v1_roadsign --image_file=./demo/road554.png --device=GPU
# FairMOT tracking
python deploy/python/mot_jde_infer.py --model_dir=output_inference/fairmot_dla34_30e_1088x608 --video_file={your video name}.mp4 --device=GPU
# ByteTrack tracking
python deploy/python/mot_sde_infer.py --model_dir=output_inference/ppyoloe_crn_l_36e_640x640_mot17half/ --tracker_config=deploy/python/tracker_config.yml --video_file={your video name}.mp4 --device=GPU --scaled=True
# FairMOT multi-object tracking combined with HRNet keypoint detection (joint inference only supports top-down keypoint models)
python deploy/python/mot_keypoint_unite_infer.py --mot_model_dir=output_inference/fairmot_dla34_30e_1088x608/ --keypoint_model_dir=output_inference/hrnet_w32_384x288/ --video_file={your video name}.mp4 --device=GPU
```
**Note:**
- For details on exporting and running multi-object tracking models, see [mot](../../configs/mot/README.md); usage of each model is covered in its own documentation;
- The tracking deployment in this directory covers the basic forward pass and joint keypoint deployment; for more tracking features, use the PP-Human project, see [pphuman](../pphuman/README.md), or the PP-Tracking project (trajectory visualization, entrance/exit flow counting), see [pptracking](../pptracking/README.md)
The parameters are described below:
| Parameter | Required | Description |
......
......@@ -32,7 +32,7 @@ sys.path.insert(0, parent_path)
from pptracking.python.mot import JDETracker
from pptracking.python.mot.utils import MOTTimer, write_mot_results
from pptracking.python.visualize import plot_tracking, plot_tracking_dict
from pptracking.python.mot.visualize import plot_tracking_dict
# Global dictionary
MOT_JDE_SUPPORT_MODELS = {
......@@ -54,23 +54,30 @@ class JDE_Detector(Detector):
trt_calib_mode (bool): If the model is produced by TRT offline quantitative
calibration, trt_calib_mode need to set True
cpu_threads (int): cpu threads
enable_mkldnn (bool): whether to open MKLDNN
output_dir (string): The path of output, default as 'output'
threshold (float): Score threshold of the detected bbox, default as 0.5
save_images (bool): Whether to save visualization image results, default as False
save_mot_txts (bool): Whether to save tracking results (txt), default as False
"""
def __init__(self,
model_dir,
tracker_config=None,
device='CPU',
run_mode='paddle',
batch_size=1,
trt_min_shape=1,
trt_max_shape=1088,
trt_opt_shape=608,
trt_calib_mode=False,
cpu_threads=1,
enable_mkldnn=False,
output_dir='output',
threshold=0.5):
def __init__(
self,
model_dir,
tracker_config=None,
device='CPU',
run_mode='paddle',
batch_size=1,
trt_min_shape=1,
trt_max_shape=1088,
trt_opt_shape=608,
trt_calib_mode=False,
cpu_threads=1,
enable_mkldnn=False,
output_dir='output',
threshold=0.5,
save_images=False,
save_mot_txts=False, ):
super(JDE_Detector, self).__init__(
model_dir=model_dir,
device=device,
......@@ -84,6 +91,8 @@ class JDE_Detector(Detector):
enable_mkldnn=enable_mkldnn,
output_dir=output_dir,
threshold=threshold, )
self.save_images = save_images
self.save_mot_txts = save_mot_txts
assert batch_size == 1, "MOT model only supports batch_size=1."
self.det_times = Timer(with_tracker=True)
self.num_classes = len(self.pred_config.labels)
......@@ -91,8 +100,8 @@ class JDE_Detector(Detector):
# tracker config
assert self.pred_config.tracker, "The exported JDE Detector model should have a tracker."
cfg = self.pred_config.tracker
min_box_area = cfg.get('min_box_area', 200)
vertical_ratio = cfg.get('vertical_ratio', 1.6)
min_box_area = cfg.get('min_box_area', 0.0)
vertical_ratio = cfg.get('vertical_ratio', 0.0)
conf_thres = cfg.get('conf_thres', 0.0)
tracked_thresh = cfg.get('tracked_thresh', 0.7)
metric_type = cfg.get('metric_type', 'euclidean')
......@@ -115,7 +124,7 @@ class JDE_Detector(Detector):
return result
def tracking(self, det_results):
pred_dets = det_results['pred_dets'] # 'cls_id, score, x0, y0, x1, y1'
pred_dets = det_results['pred_dets'] # cls_id, score, x0, y0, x1, y1
pred_embs = det_results['pred_embs']
online_targets_dict = self.tracker.update(pred_dets, pred_embs)
......@@ -164,7 +173,8 @@ class JDE_Detector(Detector):
image_list,
run_benchmark=False,
repeats=1,
visual=True):
visual=True,
seq_name=None):
mot_results = []
num_classes = self.num_classes
image_list.sort()
......@@ -225,7 +235,7 @@ class JDE_Detector(Detector):
self.det_times.img_num += 1
if visual:
if frame_id % 10 == 0:
if len(image_list) > 1 and frame_id % 10 == 0:
print('Tracking frame {}'.format(frame_id))
frame, _ = decode_image(img_file, {})
......@@ -237,7 +247,8 @@ class JDE_Detector(Detector):
online_scores,
frame_id=frame_id,
ids2names=ids2names)
seq_name = image_list[0].split('/')[-2]
if seq_name is None:
seq_name = image_list[0].split('/')[-2]
save_dir = os.path.join(self.output_dir, seq_name)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
......@@ -264,7 +275,8 @@ class JDE_Detector(Detector):
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
out_path = os.path.join(self.output_dir, video_out_name)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_format = 'mp4v'
fourcc = cv2.VideoWriter_fourcc(*video_format)
writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
frame_id = 1
......@@ -282,7 +294,9 @@ class JDE_Detector(Detector):
frame_id += 1
timer.tic()
mot_results = self.predict_image([frame], visual=False)
seq_name = video_out_name.split('.')[0]
mot_results = self.predict_image(
[frame], visual=False, seq_name=seq_name)
timer.toc()
online_tlwhs, online_scores, online_ids = mot_results[0]
......@@ -307,20 +321,33 @@ class JDE_Detector(Detector):
cv2.imshow('Mask Detection', im)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
if self.save_mot_txts:
result_filename = os.path.join(
self.output_dir, video_out_name.split('.')[-2] + '.txt')
write_mot_results(result_filename, results, data_type, num_classes)
writer.release()
def main():
detector = JDE_Detector(
FLAGS.model_dir,
tracker_config=None,
device=FLAGS.device,
run_mode=FLAGS.run_mode,
batch_size=1,
trt_min_shape=FLAGS.trt_min_shape,
trt_max_shape=FLAGS.trt_max_shape,
trt_opt_shape=FLAGS.trt_opt_shape,
trt_calib_mode=FLAGS.trt_calib_mode,
cpu_threads=FLAGS.cpu_threads,
enable_mkldnn=FLAGS.enable_mkldnn)
enable_mkldnn=FLAGS.enable_mkldnn,
output_dir=FLAGS.output_dir,
threshold=FLAGS.threshold,
save_images=FLAGS.save_images,
save_mot_txts=FLAGS.save_mot_txts)
# predict from video file or camera video stream
if FLAGS.video_file is not None or FLAGS.camera_id != -1:
......
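The `save_mot_txts` flag wired through above makes `predict_video` dump one txt file per video via `write_mot_results`. A hedged sketch of the MOTChallenge-style line format such files conventionally use (an assumption based on the standard format, not a reading of pptracking's implementation):

```python
# One line per tracked box per frame, MOTChallenge style (assumed format):
# frame_id, track_id, x, y, w, h, score, -1, -1, -1
row = '{frame},{tid},{x:.1f},{y:.1f},{w:.1f},{h:.1f},{score:.2f},-1,-1,-1'
print(row.format(frame=1, tid=3, x=10.0, y=20.0, w=50.0, h=120.0, score=0.90))
```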
......@@ -24,7 +24,7 @@ from collections import defaultdict
from mot_keypoint_unite_utils import argsparser
from preprocess import decode_image
from infer import print_arguments, get_test_images
from infer import print_arguments, get_test_images, bench_log
from mot_sde_infer import SDE_Detector
from mot_jde_infer import JDE_Detector, MOT_JDE_SUPPORT_MODELS
from keypoint_infer import KeyPointDetector, KEYPOINT_SUPPORT_MODELS
......@@ -39,7 +39,7 @@ import sys
parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))
sys.path.insert(0, parent_path)
from pptracking.python.visualize import plot_tracking, plot_tracking_dict
from pptracking.python.mot.visualize import plot_tracking, plot_tracking_dict
from pptracking.python.mot.utils import MOTTimer as FPSTimer
......@@ -92,7 +92,7 @@ def mot_topdown_unite_predict(mot_detector,
keypoint_res = predict_with_given_det(
image, results, topdown_keypoint_detector, keypoint_batch_size,
FLAGS.mot_threshold, FLAGS.keypoint_threshold, FLAGS.run_benchmark)
FLAGS.run_benchmark)
if save_res:
store_res.append([
......@@ -146,7 +146,7 @@ def mot_topdown_unite_predict_video(mot_detector,
if not os.path.exists(FLAGS.output_dir):
os.makedirs(FLAGS.output_dir)
out_path = os.path.join(FLAGS.output_dir, video_name)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fourcc = cv2.VideoWriter_fourcc(* 'mp4v')
writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
frame_id = 0
timer_mot, timer_kp, timer_mot_kp = FPSTimer(), FPSTimer(), FPSTimer()
......@@ -179,7 +179,7 @@ def mot_topdown_unite_predict_video(mot_detector,
timer_kp.tic()
keypoint_res = predict_with_given_det(
frame, results, topdown_keypoint_detector, keypoint_batch_size,
FLAGS.mot_threshold, FLAGS.keypoint_threshold, FLAGS.run_benchmark)
FLAGS.run_benchmark)
timer_kp.toc()
timer_mot_kp.toc()
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -23,15 +23,15 @@ import paddle
from benchmark_utils import PaddleInferBenchmark
from preprocess import decode_image
from utils import argsparser, Timer, get_current_memory_mb
from infer import Detector, get_test_images, print_arguments, bench_log, PredictConfig
from infer import Detector, get_test_images, print_arguments, bench_log, PredictConfig, load_predictor
# add python path
import sys
parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))
sys.path.insert(0, parent_path)
from pptracking.python.mot import JDETracker
from pptracking.python.mot.utils import MOTTimer, write_mot_results
from pptracking.python.mot import JDETracker, DeepSORTTracker
from pptracking.python.mot.utils import MOTTimer, write_mot_results, get_crops, clip_box
from pptracking.python.mot.visualize import plot_tracking, plot_tracking_dict
......@@ -50,7 +50,11 @@ class SDE_Detector(Detector):
calibration, trt_calib_mode need to set True
cpu_threads (int): cpu threads
enable_mkldnn (bool): whether to open MKLDNN
use_dark(bool): whether to use postprocess in DarkPose
output_dir (string): The path of output, default as 'output'
threshold (float): Score threshold of the detected bbox, default as 0.5
save_images (bool): Whether to save visualization image results, default as False
save_mot_txts (bool): Whether to save tracking results (txt), default as False
reid_model_dir (str): reid model dir, default None for ByteTrack, but set for DeepSORT
"""
def __init__(self,
......@@ -66,7 +70,10 @@ class SDE_Detector(Detector):
cpu_threads=1,
enable_mkldnn=False,
output_dir='output',
threshold=0.5):
threshold=0.5,
save_images=False,
save_mot_txts=False,
reid_model_dir=None):
super(SDE_Detector, self).__init__(
model_dir=model_dir,
device=device,
......@@ -80,65 +87,198 @@ class SDE_Detector(Detector):
enable_mkldnn=enable_mkldnn,
output_dir=output_dir,
threshold=threshold, )
self.save_images = save_images
self.save_mot_txts = save_mot_txts
assert batch_size == 1, "MOT model only supports batch_size=1."
self.det_times = Timer(with_tracker=True)
self.num_classes = len(self.pred_config.labels)
# tracker config
# reid config
self.use_reid = False if reid_model_dir is None else True
if self.use_reid:
self.reid_pred_config = self.set_config(reid_model_dir)
self.reid_predictor, self.config = load_predictor(
reid_model_dir,
run_mode=run_mode,
batch_size=50, # reid_batch_size
min_subgraph_size=self.reid_pred_config.min_subgraph_size,
device=device,
use_dynamic_shape=self.reid_pred_config.use_dynamic_shape,
trt_min_shape=trt_min_shape,
trt_max_shape=trt_max_shape,
trt_opt_shape=trt_opt_shape,
trt_calib_mode=trt_calib_mode,
cpu_threads=cpu_threads,
enable_mkldnn=enable_mkldnn)
else:
self.reid_pred_config = None
self.reid_predictor = None
assert tracker_config is not None, 'Note that tracker_config should be set.'
self.tracker_config = tracker_config
cfg = yaml.safe_load(open(self.tracker_config))['tracker']
min_box_area = cfg.get('min_box_area', 200)
vertical_ratio = cfg.get('vertical_ratio', 1.6)
use_byte = cfg.get('use_byte', True)
match_thres = cfg.get('match_thres', 0.9)
conf_thres = cfg.get('conf_thres', 0.6)
low_conf_thres = cfg.get('low_conf_thres', 0.1)
self.tracker = JDETracker(
use_byte=use_byte,
num_classes=self.num_classes,
min_box_area=min_box_area,
vertical_ratio=vertical_ratio,
match_thres=match_thres,
conf_thres=conf_thres,
low_conf_thres=low_conf_thres)
tracker_cfg = yaml.safe_load(open(self.tracker_config))
cfg = tracker_cfg[tracker_cfg['type']]
# tracker config
self.use_deepsort_tracker = True if tracker_cfg[
'type'] == 'DeepSORTTracker' else False
if self.use_deepsort_tracker:
# use DeepSORTTracker
if self.reid_pred_config is not None and hasattr(
self.reid_pred_config, 'tracker'):
cfg = self.reid_pred_config.tracker
budget = cfg.get('budget', 100)
max_age = cfg.get('max_age', 30)
max_iou_distance = cfg.get('max_iou_distance', 0.7)
matching_threshold = cfg.get('matching_threshold', 0.2)
min_box_area = cfg.get('min_box_area', 0)
vertical_ratio = cfg.get('vertical_ratio', 0)
self.tracker = DeepSORTTracker(
budget=budget,
max_age=max_age,
max_iou_distance=max_iou_distance,
matching_threshold=matching_threshold,
min_box_area=min_box_area,
vertical_ratio=vertical_ratio, )
else:
# use ByteTracker
use_byte = cfg.get('use_byte', False)
det_thresh = cfg.get('det_thresh', 0.3)
min_box_area = cfg.get('min_box_area', 0)
vertical_ratio = cfg.get('vertical_ratio', 0)
match_thres = cfg.get('match_thres', 0.9)
conf_thres = cfg.get('conf_thres', 0.6)
low_conf_thres = cfg.get('low_conf_thres', 0.1)
self.tracker = JDETracker(
use_byte=use_byte,
det_thresh=det_thresh,
num_classes=self.num_classes,
min_box_area=min_box_area,
vertical_ratio=vertical_ratio,
match_thres=match_thres,
conf_thres=conf_thres,
low_conf_thres=low_conf_thres, )
def postprocess(self, inputs, result):
# postprocess output of predictor
np_boxes_num = result['boxes_num']
if np_boxes_num[0] <= 0:
print('[WARNING] No object detected.')
result = {'boxes': np.zeros([0, 6]), 'boxes_num': [0]}
result = {k: v for k, v in result.items() if v is not None}
return result
def reidprocess(self, det_results, repeats=1):
pred_dets = det_results['boxes']
pred_xyxys = pred_dets[:, 2:6]
ori_image = det_results['ori_image']
ori_image_shape = ori_image.shape[:2]
pred_xyxys, keep_idx = clip_box(pred_xyxys, ori_image_shape)
if len(keep_idx[0]) == 0:
det_results['boxes'] = np.zeros((1, 6), dtype=np.float32)
det_results['embeddings'] = None
return det_results
pred_dets = pred_dets[keep_idx[0]]
pred_xyxys = pred_dets[:, 2:6]
w, h = self.tracker.input_size
crops = get_crops(pred_xyxys, ori_image, w, h)
# to keep fast speed, only use topk crops
crops = crops[:50] # reid_batch_size
det_results['crops'] = np.array(crops).astype('float32')
det_results['boxes'] = pred_dets[:50]
input_names = self.reid_predictor.get_input_names()
for i in range(len(input_names)):
input_tensor = self.reid_predictor.get_input_handle(input_names[i])
input_tensor.copy_from_cpu(det_results[input_names[i]])
# model prediction
for i in range(repeats):
self.reid_predictor.run()
output_names = self.reid_predictor.get_output_names()
feature_tensor = self.reid_predictor.get_output_handle(output_names[
0])
pred_embs = feature_tensor.copy_to_cpu()
det_results['embeddings'] = pred_embs
return det_results
def tracking(self, det_results):
pred_dets = det_results['boxes'] # 'cls_id, score, x0, y0, x1, y1'
pred_embs = None
online_targets_dict = self.tracker.update(pred_dets, pred_embs)
online_tlwhs = defaultdict(list)
online_scores = defaultdict(list)
online_ids = defaultdict(list)
for cls_id in range(self.num_classes):
online_targets = online_targets_dict[cls_id]
pred_embs = det_results.get('embeddings', None)
if self.use_deepsort_tracker:
# use DeepSORTTracker, only supports a single class
self.tracker.predict()
online_targets = self.tracker.update(pred_dets, pred_embs)
online_tlwhs, online_scores, online_ids = [], [], []
for t in online_targets:
tlwh = t.tlwh
tid = t.track_id
tscore = t.score
if tlwh[2] * tlwh[3] <= self.tracker.min_box_area:
if not t.is_confirmed() or t.time_since_update > 1:
continue
tlwh = t.to_tlwh()
tscore = t.score
tid = t.track_id
if self.tracker.vertical_ratio > 0 and tlwh[2] / tlwh[
3] > self.tracker.vertical_ratio:
continue
online_tlwhs[cls_id].append(tlwh)
online_ids[cls_id].append(tid)
online_scores[cls_id].append(tscore)
return online_tlwhs, online_scores, online_ids
online_tlwhs.append(tlwh)
online_scores.append(tscore)
online_ids.append(tid)
tracking_outs = {
'online_tlwhs': online_tlwhs,
'online_scores': online_scores,
'online_ids': online_ids,
}
return tracking_outs
else:
# use ByteTracker, supports multiple classes
online_tlwhs = defaultdict(list)
online_scores = defaultdict(list)
online_ids = defaultdict(list)
online_targets_dict = self.tracker.update(pred_dets, pred_embs)
for cls_id in range(self.num_classes):
online_targets = online_targets_dict[cls_id]
for t in online_targets:
tlwh = t.tlwh
tid = t.track_id
tscore = t.score
if tlwh[2] * tlwh[3] <= self.tracker.min_box_area:
continue
if self.tracker.vertical_ratio > 0 and tlwh[2] / tlwh[
3] > self.tracker.vertical_ratio:
continue
online_tlwhs[cls_id].append(tlwh)
online_ids[cls_id].append(tid)
online_scores[cls_id].append(tscore)
tracking_outs = {
'online_tlwhs': online_tlwhs,
'online_scores': online_scores,
'online_ids': online_ids,
}
return tracking_outs
def predict_image(self,
image_list,
run_benchmark=False,
repeats=1,
visual=True):
mot_results = []
visual=True,
seq_name=None):
num_classes = self.num_classes
image_list.sort()
ids2names = self.pred_config.labels
mot_results = []
for frame_id, img_file in enumerate(image_list):
batch_image_list = [img_file] # bs=1 in MOT model
frame, _ = decode_image(img_file, {})
if run_benchmark:
# preprocess
inputs = self.preprocess(batch_image_list) # warmup
......@@ -159,10 +299,16 @@ class SDE_Detector(Detector):
self.det_times.postprocess_time_s.end()
# tracking
if self.use_reid:
det_result['frame_id'] = frame_id
det_result['seq_name'] = seq_name
det_result['ori_image'] = frame
det_result = self.reidprocess(det_result)
result_warmup = self.tracking(det_result)
self.det_times.tracking_time_s.start()
online_tlwhs, online_scores, online_ids = self.tracking(
det_result)
if self.use_reid:
det_result = self.reidprocess(det_result)
tracking_outs = self.tracking(det_result)
self.det_times.tracking_time_s.end()
self.det_times.img_num += 1
......@@ -186,32 +332,47 @@ class SDE_Detector(Detector):
# tracking process
self.det_times.tracking_time_s.start()
online_tlwhs, online_scores, online_ids = self.tracking(
det_result)
if self.use_reid:
det_result['frame_id'] = frame_id
det_result['seq_name'] = seq_name
det_result['ori_image'] = frame
det_result = self.reidprocess(det_result)
tracking_outs = self.tracking(det_result)
self.det_times.tracking_time_s.end()
self.det_times.img_num += 1
online_tlwhs = tracking_outs['online_tlwhs']
online_scores = tracking_outs['online_scores']
online_ids = tracking_outs['online_ids']
mot_results.append([online_tlwhs, online_scores, online_ids])
if visual:
if frame_id % 10 == 0:
if len(image_list) > 1 and frame_id % 10 == 0:
print('Tracking frame {}'.format(frame_id))
frame, _ = decode_image(img_file, {})
im = plot_tracking_dict(
frame,
num_classes,
online_tlwhs,
online_ids,
online_scores,
frame_id=frame_id,
ids2names=[])
seq_name = image_list[0].split('/')[-2]
if isinstance(online_tlwhs, defaultdict):
im = plot_tracking_dict(
frame,
num_classes,
online_tlwhs,
online_ids,
online_scores,
frame_id=frame_id,
ids2names=[])
else:
im = plot_tracking(
frame,
online_tlwhs,
online_ids,
online_scores,
frame_id=frame_id)
save_dir = os.path.join(self.output_dir, seq_name)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
cv2.imwrite(
os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), im)
mot_results.append([online_tlwhs, online_scores, online_ids])
return mot_results
def predict_video(self, video_file, camera_id):
......@@ -231,13 +392,17 @@ class SDE_Detector(Detector):
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
out_path = os.path.join(self.output_dir, video_out_name)
fourcc = cv2.VideoWriter_fourcc(* 'mp4v')
video_format = 'mp4v'
fourcc = cv2.VideoWriter_fourcc(*video_format)
writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
frame_id = 1
timer = MOTTimer()
results = defaultdict(list) # support single class and multi classes
results = defaultdict(list)
num_classes = self.num_classes
data_type = 'mcmot' if num_classes > 1 else 'mot'
ids2names = self.pred_config.labels
while (1):
ret, frame = capture.read()
if not ret:
......@@ -247,31 +412,53 @@ class SDE_Detector(Detector):
frame_id += 1
timer.tic()
mot_results = self.predict_image([frame], visual=False)
seq_name = video_out_name.split('.')[0]
mot_results = self.predict_image(
[frame], visual=False, seq_name=seq_name)
timer.toc()
# bs=1 in MOT model
online_tlwhs, online_scores, online_ids = mot_results[0]
for cls_id in range(num_classes):
results[cls_id].append(
(frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id],
online_ids[cls_id]))
fps = 1. / timer.duration
im = plot_tracking_dict(
frame,
num_classes,
online_tlwhs,
online_ids,
online_scores,
frame_id=frame_id,
fps=fps,
ids2names=[])
if self.use_deepsort_tracker:
# use DeepSORTTracker, only supports a single class
results[0].append(
(frame_id + 1, online_tlwhs, online_scores, online_ids))
im = plot_tracking(
frame,
online_tlwhs,
online_ids,
online_scores,
frame_id=frame_id,
fps=fps)
else:
# use ByteTracker, supports multiple classes
for cls_id in range(num_classes):
results[cls_id].append(
(frame_id + 1, online_tlwhs[cls_id],
online_scores[cls_id], online_ids[cls_id]))
im = plot_tracking_dict(
frame,
num_classes,
online_tlwhs,
online_ids,
online_scores,
frame_id=frame_id,
fps=fps,
ids2names=ids2names)
writer.write(im)
if camera_id != -1:
cv2.imshow('Mask Detection', im)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
if self.save_mot_txts:
result_filename = os.path.join(
self.output_dir, video_out_name.split('.')[-2] + '.txt')
write_mot_results(result_filename, results)
writer.release()
......@@ -282,18 +469,20 @@ def main():
arch = yml_conf['arch']
detector = SDE_Detector(
FLAGS.model_dir,
FLAGS.tracker_config,
tracker_config=FLAGS.tracker_config,
device=FLAGS.device,
run_mode=FLAGS.run_mode,
batch_size=FLAGS.batch_size,
batch_size=1,
trt_min_shape=FLAGS.trt_min_shape,
trt_max_shape=FLAGS.trt_max_shape,
trt_opt_shape=FLAGS.trt_opt_shape,
trt_calib_mode=FLAGS.trt_calib_mode,
cpu_threads=FLAGS.cpu_threads,
enable_mkldnn=FLAGS.enable_mkldnn,
output_dir=FLAGS.output_dir,
threshold=FLAGS.threshold,
output_dir=FLAGS.output_dir)
save_images=FLAGS.save_images,
save_mot_txts=FLAGS.save_mot_txts, )
# predict from video file or camera video stream
if FLAGS.video_file is not None or FLAGS.camera_id != -1:
......@@ -303,7 +492,9 @@ def main():
if FLAGS.image_dir is None and FLAGS.image_file is not None:
assert FLAGS.batch_size == 1, "--batch_size should be 1 in MOT models."
img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
detector.predict_image(img_list, FLAGS.run_benchmark, repeats=10)
seq_name = FLAGS.image_dir.split('/')[-1]
detector.predict_image(
img_list, FLAGS.run_benchmark, repeats=10, seq_name=seq_name)
if not FLAGS.run_benchmark:
detector.det_times.info(average=True)
......
# config of tracker for MOT SDE Detector, use ByteTracker as default.
# The tracker of MOT JDE Detector is exported together with the model.
# config of tracker for MOT SDE Detector, uses 'JDETracker' by default.
# The tracker of a MOT JDE Detector (such as FairMOT) is exported together with the model.
# Here 'min_box_area' and 'vertical_ratio' are set for pedestrians; modify them to track other objects.
tracker:
use_byte: true
type: JDETracker # 'JDETracker' or 'DeepSORTTracker'
# BYTETracker
JDETracker:
use_byte: True
det_thresh: 0.3
conf_thres: 0.6
low_conf_thres: 0.1
match_thres: 0.9
min_box_area: 100
vertical_ratio: 1.6
min_box_area: 0
vertical_ratio: 0 # 1.6 for pedestrian
DeepSORTTracker:
input_size: [64, 192]
min_box_area: 0
vertical_ratio: -1
budget: 100
max_age: 70
n_init: 3
metric_type: cosine
matching_threshold: 0.2
max_iou_distance: 0.9
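For reference, `SDE_Detector` in the diff above consumes this file by reading the top-level `type` key and then only the matching section; a condensed sketch of that selection logic:

```python
import yaml

# Mirrors "tracker_cfg = yaml.safe_load(...)" and "cfg = tracker_cfg[tracker_cfg['type']]"
# from mot_sde_infer.py above; the path is the file shown in this diff.
with open('deploy/python/tracker_config.yml') as f:
    tracker_cfg = yaml.safe_load(f)

tracker_type = tracker_cfg['type']  # 'JDETracker' or 'DeepSORTTracker'
cfg = tracker_cfg[tracker_type]     # only the selected section is read
if tracker_type == 'DeepSORTTracker':
    print('DeepSORT:', cfg.get('budget', 100), cfg.get('max_age', 30))
else:
    print('ByteTrack:', cfg.get('use_byte', False), cfg.get('conf_thres', 0.6))
```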
......@@ -44,7 +44,7 @@ class JDETracker(object):
track_buffer (int): buffer for tracker
min_box_area (int): min box area to filter out low quality boxes
vertical_ratio (float): w/h, the vertical ratio of the bbox to filter
bad results. If set <0, no bboxes are filtered; usually set to
bad results. If set <= 0, no bboxes are filtered; usually set to
1.6 for pedestrian tracking.
tracked_thresh (float): linear assignment threshold of tracked
stracks and detections
......@@ -70,8 +70,8 @@ class JDETracker(object):
num_classes=1,
det_thresh=0.3,
track_buffer=30,
min_box_area=200,
vertical_ratio=1.6,
min_box_area=0,
vertical_ratio=0,
tracked_thresh=0.7,
r_tracked_thresh=0.5,
unconfirmed_thresh=0.7,
......@@ -167,9 +167,8 @@ class JDETracker(object):
detections = [
STrack(
STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id,
30, temp_feat)
for (tlbrs, temp_feat
) in zip(pred_dets_cls, pred_embs_cls)
30, temp_feat) for (tlbrs, temp_feat) in
zip(pred_dets_cls, pred_embs_cls)
]
else:
detections = []
......@@ -244,15 +243,13 @@ class JDETracker(object):
for tlbrs in pred_dets_cls_second
]
else:
pred_embs_cls_second = pred_embs_dict[cls_id][inds_second]
pred_embs_cls_second = pred_embs_dict[cls_id][
inds_second]
detections_second = [
STrack(
STrack.tlbr_to_tlwh(tlbrs[2:6]),
tlbrs[1],
cls_id,
30,
temp_feat)
for (tlbrs, temp_feat) in zip(pred_dets_cls_second, pred_embs_cls_second)
STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1],
cls_id, 30, temp_feat) for (tlbrs, temp_feat) in
zip(pred_dets_cls_second, pred_embs_cls_second)
]
else:
detections_second = []
......