From 0c2b74f962df97a852e3c9ef5538945745414541 Mon Sep 17 00:00:00 2001
From: George Ni
Date: Tue, 20 Jul 2021 17:28:26 +0800
Subject: [PATCH] [MOT] Fix mot video decode (#3681)

* mot video decode to images
* add infer_dir for mot
* fix export and unite pose infer
* remove task in mot source
* set image_dir
* update doc
---
 configs/datasets/mot.yml                      |  17 +--
 configs/mot/README.md                         |  11 +-
 configs/mot/README_cn.md                      |  10 +-
 .../_base_/deepsort_reader_1088x608.yml       |   1 +
 .../deepsort/deepsort_pcb_pyramid_r101.yml    |   1 -
 .../deepsort_yolov3_pcb_pyramid_r101.yml      |   1 -
 configs/mot/fairmot/README.md                 |   1 -
 configs/mot/fairmot/README_cn.md              |   1 -
 .../_base_/fairmot_reader_1088x608.yml        |   3 +-
 configs/mot/jde/README.md                     |   1 -
 configs/mot/jde/README_cn.md                  |   1 -
 .../mot/jde/_base_/jde_reader_1088x608.yml    |   1 +
 configs/mot/jde/_base_/jde_reader_576x320.yml |   1 +
 configs/mot/jde/_base_/jde_reader_864x480.yml |   1 +
 deploy/python/mot_jde_infer.py                |   5 +-
 deploy/python/mot_keypoint_unite_infer.py     |   4 +-
 ppdet/data/source/mot.py                      | 144 ++++++++++--------
 ppdet/engine/export_utils.py                  |   6 +-
 ppdet/engine/tracker.py                       |  40 ++++-
 requirements.txt                              |   1 -
 tools/eval_mot.py                             |   4 +-
 tools/infer_mot.py                            |   6 +
 22 files changed, 159 insertions(+), 102 deletions(-)

diff --git a/configs/datasets/mot.yml b/configs/datasets/mot.yml
index d6056968d..7107da490 100644
--- a/configs/datasets/mot.yml
+++ b/configs/datasets/mot.yml
@@ -1,18 +1,6 @@
 metric: MOT
 num_classes: 1
 
-MOTDataZoo: {
-  'MOT15_train': ['ADL-Rundle-6', 'ADL-Rundle-8', 'ETH-Bahnhof', 'ETH-Pedcross2', 'ETH-Sunnyday', 'KITTI-13', 'KITTI-17', 'PETS09-S2L1', 'TUD-Campus', 'TUD-Stadtmitte', 'Venice-2'],
-  'MOT15_test': ['ADL-Rundle-1', 'ADL-Rundle-3', 'AVG-TownCentre', 'ETH-Crossing', 'ETH-Jelmoli', 'ETH-Linthescher', 'KITTI-16', 'KITTI-19', 'PETS09-S2L2', 'TUD-Crossing', 'Venice-1'],
-  'MOT16_train': ['MOT16-02', 'MOT16-04', 'MOT16-05', 'MOT16-09', 'MOT16-10', 'MOT16-11', 'MOT16-13'],
-  'MOT16_test': ['MOT16-01', 'MOT16-03', 'MOT16-06', 'MOT16-07', 'MOT16-08', 'MOT16-12', 'MOT16-14'],
-  'MOT17_train': ['MOT17-02-SDP', 'MOT17-04-SDP', 'MOT17-05-SDP', 'MOT17-09-SDP', 'MOT17-10-SDP', 'MOT17-11-SDP', 'MOT17-13-SDP'],
-  'MOT17_test': ['MOT17-01-SDP', 'MOT17-03-SDP', 'MOT17-06-SDP', 'MOT17-07-SDP', 'MOT17-08-SDP', 'MOT17-12-SDP', 'MOT17-14-SDP'],
-  'MOT20_train': ['MOT20-01', 'MOT20-02', 'MOT20-03', 'MOT20-05'],
-  'MOT20_test': ['MOT20-04', 'MOT20-06', 'MOT20-07', 'MOT20-08'],
-  'demo': ['MOT16-02'],
-}
-
 # for MOT training
 TrainDataset:
   !MOTDataSet
@@ -21,16 +9,15 @@ TrainDataset:
     data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_ide']
 
 # for MOT evaluation
-# If you want to change the MOT evaluation dataset, please modify 'task' and 'data_root'
+# If you want to change the MOT evaluation dataset, please modify 'data_root'
 EvalMOTDataset:
   !MOTImageFolder
-    task: MOT16_train
     dataset_dir: dataset/mot
     data_root: MOT16/images/train
     keep_ori_im: False # set True if save visualization images or video, or used in DeepSORT
 
 # for MOT video inference
 TestMOTDataset:
-  !MOTVideoDataset
+  !MOTImageFolder
    dataset_dir: dataset/mot
    keep_ori_im: True # set True if save visualization images or video
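For readers tracing the config change: the YAML entries above map one-to-one onto constructor arguments of the dataset class this patch rewrites. Below is a minimal sketch of that mapping, with values copied from the config; the sketch itself is illustrative and not part of the patch:

```python
# Hedged sketch: how the YAML entries above map onto the patched dataset class.
# The import path follows PaddleDetection's layout (ppdet/data/source/mot.py).
from ppdet.data.source.mot import MOTImageFolder

# Equivalent of EvalMOTDataset in configs/datasets/mot.yml:
eval_dataset = MOTImageFolder(
    dataset_dir='dataset/mot',       # root directory of the dataset
    data_root='MOT16/images/train',  # sequences to evaluate
    keep_ori_im=False)               # True when saving visualizations

# Equivalent of TestMOTDataset, which now reuses the same class for video input:
test_dataset = MOTImageFolder(
    dataset_dir='dataset/mot',
    keep_ori_im=True)
```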
diff --git a/configs/mot/README.md b/configs/mot/README.md
index 1a34f9d07..b33c3d7ad 100644
--- a/configs/mot/README.md
+++ b/configs/mot/README.md
@@ -224,11 +224,10 @@ CUDA_VISIBLE_DEVICES=0 python tools/eval_mot.py -c configs/mot/fairmot/fairmot_d
 CUDA_VISIBLE_DEVICES=0 python tools/eval_mot.py -c configs/mot/fairmot/fairmot_dla34_30e_1088x608.yml -o weights=output/fairmot_dla34_30e_1088x608/model_final.pdparams
 ```
 **Notes:**
- The default evaluation dataset is MOT-16 Train Set. If you want to change the evaluation dataset, please refer to the following code and modify `configs/datasets/mot.yml`:
+ The default evaluation dataset is MOT-16 Train Set. If you want to change the evaluation dataset, modify `data_root` in `configs/datasets/mot.yml` as shown below:
 ```
 EvalMOTDataset:
   !MOTImageFolder
-    task: MOT17_train
     dataset_dir: dataset/mot
     data_root: MOT17/images/train
     keep_ori_im: False # set True if save visualization images or video
@@ -242,6 +241,14 @@ Inference a video on single GPU with following command:
 # inference on video and save a video
 CUDA_VISIBLE_DEVICES=0 python tools/infer_mot.py -c configs/mot/fairmot/fairmot_dla34_30e_1088x608.yml -o weights=https://paddledet.bj.bcebos.com/models/mot/fairmot_dla34_30e_1088x608.pdparams --video_file={your video name}.mp4 --save_videos
 ```
+
+Infer an image folder on a single GPU with the following command:
+
+```bash
+# inference on an image folder and save a video
+CUDA_VISIBLE_DEVICES=0 python tools/infer_mot.py -c configs/mot/fairmot/fairmot_dla34_30e_1088x608.yml -o weights=https://paddledet.bj.bcebos.com/models/mot/fairmot_dla34_30e_1088x608.pdparams --image_dir={your infer images folder} --save_videos
+```
+
 **Notes:**
 Please make sure that [ffmpeg](https://ffmpeg.org/ffmpeg.html) is installed first, on Linux(Ubuntu) platform you can directly install it by the following command:`apt-get update && apt-get install -y ffmpeg`.

diff --git a/configs/mot/README_cn.md b/configs/mot/README_cn.md
index 8025f54ac..b5ab66c07 100644
--- a/configs/mot/README_cn.md
+++ b/configs/mot/README_cn.md
@@ -222,11 +222,10 @@ CUDA_VISIBLE_DEVICES=0 python tools/eval_mot.py -c configs/mot/fairmot/fairmot_d
 CUDA_VISIBLE_DEVICES=0 python tools/eval_mot.py -c configs/mot/fairmot/fairmot_dla34_30e_1088x608.yml -o weights=output/fairmot_dla34_30e_1088x608/model_final.pdparams
 ```
 **Notes:**
- The default evaluation dataset is the MOT-16 Train Set. To switch evaluation datasets, modify `configs/datasets/mot.yml` following the code below:
+ The default evaluation dataset is the MOT-16 Train Set. To switch evaluation datasets, modify `data_root` in `configs/datasets/mot.yml` following the code below:
 ```
 EvalMOTDataset:
   !MOTImageFolder
-    task: MOT17_train
     dataset_dir: dataset/mot
     data_root: MOT17/images/train
     keep_ori_im: False # set True if save visualization images or video
@@ -241,6 +240,13 @@ EvalMOTDataset:
 CUDA_VISIBLE_DEVICES=0 python tools/infer_mot.py -c configs/mot/fairmot/fairmot_dla34_30e_1088x608.yml -o weights=https://paddledet.bj.bcebos.com/models/mot/fairmot_dla34_30e_1088x608.pdparams --video_file={your video name}.mp4 --save_videos
 ```
 
+Use a single GPU to run inference on an image folder and save the result as a video with the following command:
+
+```bash
+# inference on an image folder
+CUDA_VISIBLE_DEVICES=0 python tools/infer_mot.py -c configs/mot/fairmot/fairmot_dla34_30e_1088x608.yml -o weights=https://paddledet.bj.bcebos.com/models/mot/fairmot_dla34_30e_1088x608.pdparams --image_dir={your infer images folder} --save_videos
+```
+
 **Notes:**
 Please make sure [ffmpeg](https://ffmpeg.org/ffmpeg.html) is installed first; on the Linux (Ubuntu) platform it can be installed directly with: `apt-get update && apt-get install -y ffmpeg`.
diff --git a/configs/mot/deepsort/_base_/deepsort_reader_1088x608.yml b/configs/mot/deepsort/_base_/deepsort_reader_1088x608.yml
index 0ef445085..1bbc28fd8 100644
--- a/configs/mot/deepsort/_base_/deepsort_reader_1088x608.yml
+++ b/configs/mot/deepsort/_base_/deepsort_reader_1088x608.yml
@@ -11,6 +11,7 @@ TestMOTReader:
   inputs_def:
     image_shape: [3, 608, 1088]
   sample_transforms:
+    - Decode: {}
     - LetterBoxResize: {target_size: [608, 1088]}
     - NormalizeImage: {mean: [0, 0, 0], std: [1, 1, 1], is_scale: True}
     - Permute: {}

diff --git a/configs/mot/deepsort/deepsort_pcb_pyramid_r101.yml b/configs/mot/deepsort/deepsort_pcb_pyramid_r101.yml
index f6a4bd19c..c79d1f291 100644
--- a/configs/mot/deepsort/deepsort_pcb_pyramid_r101.yml
+++ b/configs/mot/deepsort/deepsort_pcb_pyramid_r101.yml
@@ -7,7 +7,6 @@ _BASE_: [
 
 EvalMOTDataset:
   !MOTImageFolder
-    task: MOT16_train
     dataset_dir: dataset/mot
     data_root: MOT16/images/train
     keep_ori_im: True # set as True in DeepSORT

diff --git a/configs/mot/deepsort/deepsort_yolov3_pcb_pyramid_r101.yml b/configs/mot/deepsort/deepsort_yolov3_pcb_pyramid_r101.yml
index ca8ec5d56..58aa29ed1 100644
--- a/configs/mot/deepsort/deepsort_yolov3_pcb_pyramid_r101.yml
+++ b/configs/mot/deepsort/deepsort_yolov3_pcb_pyramid_r101.yml
@@ -7,7 +7,6 @@ _BASE_: [
 
 EvalMOTDataset:
   !MOTImageFolder
-    task: MOT16_train
     dataset_dir: dataset/mot
     data_root: MOT16/images/train
     keep_ori_im: True # set as True in DeepSORT

diff --git a/configs/mot/fairmot/README.md b/configs/mot/fairmot/README.md
index c1d5616e3..4f139d8c4 100644
--- a/configs/mot/fairmot/README.md
+++ b/configs/mot/fairmot/README.md
@@ -59,7 +59,6 @@ CUDA_VISIBLE_DEVICES=0 python tools/eval_mot.py -c configs/mot/fairmot/fairmot_d
 ```
 EvalMOTDataset:
   !MOTImageFolder
-    task: MOT17_train
     dataset_dir: dataset/mot
     data_root: MOT17/images/train
     keep_ori_im: False # set True if save visualization images or video

diff --git a/configs/mot/fairmot/README_cn.md b/configs/mot/fairmot/README_cn.md
index 18b3428bf..ae4b98a22 100644
--- a/configs/mot/fairmot/README_cn.md
+++ b/configs/mot/fairmot/README_cn.md
@@ -57,7 +57,6 @@ CUDA_VISIBLE_DEVICES=0 python tools/eval_mot.py -c configs/mot/fairmot/fairmot_d
 ```
 EvalMOTDataset:
   !MOTImageFolder
-    task: MOT17_train
     dataset_dir: dataset/mot
     data_root: MOT17/images/train
     keep_ori_im: False # set True if save visualization images or video

diff --git a/configs/mot/fairmot/_base_/fairmot_reader_1088x608.yml b/configs/mot/fairmot/_base_/fairmot_reader_1088x608.yml
index 03834508c..1530d6928 100644
--- a/configs/mot/fairmot/_base_/fairmot_reader_1088x608.yml
+++ b/configs/mot/fairmot/_base_/fairmot_reader_1088x608.yml
@@ -22,8 +22,6 @@ TrainReader:
 
 
 EvalMOTReader:
-  inputs_def:
-    image_shape: [3, 608, 1088]
   sample_transforms:
     - Decode: {}
     - LetterBoxResize: {target_size: [608, 1088]}
@@ -36,6 +34,7 @@ TestMOTReader:
   inputs_def:
     image_shape: [3, 608, 1088]
   sample_transforms:
+    - Decode: {}
     - LetterBoxResize: {target_size: [608, 1088]}
     - NormalizeImage: {mean: [0, 0, 0], std: [1, 1, 1]}
     - Permute: {}

diff --git a/configs/mot/jde/README.md b/configs/mot/jde/README.md
index 2b66e7273..f71ece8ee 100644
--- a/configs/mot/jde/README.md
+++ b/configs/mot/jde/README.md
@@ -65,7 +65,6 @@ CUDA_VISIBLE_DEVICES=0 python tools/eval_mot.py -c configs/mot/jde/jde_darknet53
 ```
 EvalMOTDataset:
   !MOTImageFolder
-    task: MOT17_train
     dataset_dir: dataset/mot
     data_root: MOT17/images/train
     keep_ori_im: False # set True if save visualization images or video

diff --git a/configs/mot/jde/README_cn.md b/configs/mot/jde/README_cn.md
index e441c4478..1aa2d3830 100644
--- a/configs/mot/jde/README_cn.md
+++ b/configs/mot/jde/README_cn.md
@@ -66,7 +66,6 @@ CUDA_VISIBLE_DEVICES=0 python tools/eval_mot.py -c configs/mot/jde/jde_darknet53
 ```
 EvalMOTDataset:
   !MOTImageFolder
-    task: MOT17_train
     dataset_dir: dataset/mot
     data_root: MOT17/images/train
     keep_ori_im: False # set True if save visualization images or video
diff --git a/configs/mot/jde/_base_/jde_reader_1088x608.yml b/configs/mot/jde/_base_/jde_reader_1088x608.yml
index 3e41b3721..922f08b98 100644
--- a/configs/mot/jde/_base_/jde_reader_1088x608.yml
+++ b/configs/mot/jde/_base_/jde_reader_1088x608.yml
@@ -41,6 +41,7 @@ TestMOTReader:
   inputs_def:
     image_shape: [3, 608, 1088]
   sample_transforms:
+    - Decode: {}
     - LetterBoxResize: {target_size: [608, 1088]}
     - NormalizeImage: {mean: [0, 0, 0], std: [1, 1, 1], is_scale: True}
     - Permute: {}

diff --git a/configs/mot/jde/_base_/jde_reader_576x320.yml b/configs/mot/jde/_base_/jde_reader_576x320.yml
index fc50f4400..d1205ada7 100644
--- a/configs/mot/jde/_base_/jde_reader_576x320.yml
+++ b/configs/mot/jde/_base_/jde_reader_576x320.yml
@@ -41,6 +41,7 @@ TestMOTReader:
   inputs_def:
     image_shape: [3, 320, 576]
   sample_transforms:
+    - Decode: {}
     - LetterBoxResize: {target_size: [320, 576]}
     - NormalizeImage: {mean: [0, 0, 0], std: [1, 1, 1], is_scale: True}
     - Permute: {}

diff --git a/configs/mot/jde/_base_/jde_reader_864x480.yml b/configs/mot/jde/_base_/jde_reader_864x480.yml
index 9178fa2b2..439eced58 100644
--- a/configs/mot/jde/_base_/jde_reader_864x480.yml
+++ b/configs/mot/jde/_base_/jde_reader_864x480.yml
@@ -41,6 +41,7 @@ TestMOTReader:
   inputs_def:
     image_shape: [3, 480, 864]
   sample_transforms:
+    - Decode: {}
     - LetterBoxResize: {target_size: [480, 864]}
     - NormalizeImage: {mean: [0, 0, 0], std: [1, 1, 1], is_scale: True}
     - Permute: {}

diff --git a/deploy/python/mot_jde_infer.py b/deploy/python/mot_jde_infer.py
index 651fac58b..021e5aea7 100644
--- a/deploy/python/mot_jde_infer.py
+++ b/deploy/python/mot_jde_infer.py
@@ -84,7 +84,10 @@ class JDE_Detector(Detector):
         conf_thres = tp['conf_thres'] if 'conf_thres' in tp else 0.
         tracked_thresh = tp['tracked_thresh'] if 'tracked_thresh' in tp else 0.7
         metric_type = tp['metric_type'] if 'metric_type' in tp else 'euclidean'
-        self.tracker = JDETracker(conf_thres=conf_thres, tracked_thresh=tracked_thresh, metric_type=metric_type)
+        self.tracker = JDETracker(
+            conf_thres=conf_thres,
+            tracked_thresh=tracked_thresh,
+            metric_type=metric_type)
 
     def postprocess(self, pred_dets, pred_embs, threshold):
         online_targets = self.tracker.update(pred_dets, pred_embs)

diff --git a/deploy/python/mot_keypoint_unite_infer.py b/deploy/python/mot_keypoint_unite_infer.py
index 1ef333aca..46ecefe78 100644
--- a/deploy/python/mot_keypoint_unite_infer.py
+++ b/deploy/python/mot_keypoint_unite_infer.py
@@ -178,7 +178,9 @@ def mot_keypoint_unite_predict_video(mot_model,
                 keypoint_results,
                 visual_thread=FLAGS.keypoint_threshold,
                 returnimg=True,
-                ids=online_ids)
+                ids=online_ids
+                if KEYPOINT_SUPPORT_MODELS[keypoint_arch] == 'keypoint_topdown'
+                else None)
 
             online_im = mot_vis.plot_tracking(
                 im,
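A side note on the tracker-parameter lookups reformatted in `mot_jde_infer.py` above: the `tp['conf_thres'] if 'conf_thres' in tp else 0.` conditional is behaviorally identical to `dict.get` with a default value. A tiny self-contained illustration; the dict contents here are made up:

```python
# Illustration only: both forms below read a key with a fallback default.
tp = {'tracked_thresh': 0.7, 'metric_type': 'euclidean'}

conf_thres = tp['conf_thres'] if 'conf_thres' in tp else 0.  # explicit conditional
assert conf_thres == tp.get('conf_thres', 0.)                # equivalent dict.get form
```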
+ """ + def __init__(self, - task, + video_file=None, dataset_dir=None, data_root=None, image_dir=None, @@ -238,20 +250,53 @@ class MOTImageFolder(DetDataset): **kwargs): super(MOTImageFolder, self).__init__( dataset_dir, image_dir, sample_num=sample_num) - self.task = task + self.video_file = video_file self.data_root = data_root self.keep_ori_im = keep_ori_im self._imid2path = {} self.roidbs = None + self.frame_rate = 30 def check_or_download_dataset(self): return def parse_dataset(self, ): if not self.roidbs: - self.roidbs = self._load_images() + if self.video_file is None: + self.roidbs = self._load_images() + else: + self.roidbs = self._load_video_images() + + def _load_video_images(self): + cap = cv2.VideoCapture(self.video_file) + self.frame_rate = int(cap.get(cv2.CAP_PROP_FPS)) + + extension = self.video_file.split('.')[-1] + output_path = self.video_file.replace('.{}'.format(extension), '') + frames_path = video2frames(self.video_file, output_path) + self.video_frames = sorted( + glob.glob(os.path.join(frames_path, '*.png'))) + + self.video_length = len(self.video_frames) + logger.info('Length of the video: {:d} frames.'.format( + self.video_length)) + ct = 0 + records = [] + for image in self.video_frames: + assert image != '' and os.path.isfile(image), \ + "Image {} not found".format(image) + if self.sample_num > 0 and ct >= self.sample_num: + break + rec = {'im_id': np.array([ct]), 'im_file': image} + if self.keep_ori_im: + rec.update({'keep_ori_im': 1}) + self._imid2path[ct] = image + ct += 1 + records.append(rec) + assert len(records) > 0, "No image file found" + return records - def _parse(self): + def _find_images(self): image_dir = self.image_dir if not isinstance(image_dir, Sequence): image_dir = [image_dir] @@ -265,7 +310,7 @@ class MOTImageFolder(DetDataset): return images def _load_images(self): - images = self._parse() + images = self._find_images() ct = 0 records = [] for image in images: @@ -289,67 +334,44 @@ class MOTImageFolder(DetDataset): self.image_dir = images self.roidbs = self._load_images() + def set_video(self, video_file): + self.video_file = video_file + assert os.path.isfile(self.video_file) and _is_valid_video(self.video_file), \ + "wrong or unsupported file format: {}".format(self.video_file) + self.roidbs = self._load_video_images() + def _is_valid_video(f, extensions=('.mp4', '.avi', '.mov', '.rmvb', 'flv')): return f.lower().endswith(extensions) -@register -@serializable -class MOTVideoDataset(DetDataset): - """ - Load MOT dataset with MOT format from video for inference. - Args: - video_file (str): path of the video file - dataset_dir (str): root directory for dataset. - keep_ori_im (bool): whether to keep original image, default False. - Set True when used during MOT model inference while saving - images or video, or used in DeepSORT. 
- """ +def video2frames(video_path, outpath, **kargs): + def _dict2str(kargs): + cmd_str = '' + for k, v in kargs.items(): + cmd_str += (' ' + str(k) + ' ' + str(v)) + return cmd_str - def __init__(self, - video_file='', - dataset_dir=None, - keep_ori_im=False, - **kwargs): - super(MOTVideoDataset, self).__init__(dataset_dir=dataset_dir) - self.video_file = video_file - self.dataset_dir = dataset_dir - self.keep_ori_im = keep_ori_im - self.roidbs = None - self.frame_rate = 25 + ffmpeg = ['ffmpeg ', ' -y -loglevel ', ' error '] + vid_name = os.path.basename(video_path).split('.')[0] + out_full_path = os.path.join(outpath, vid_name) - def parse_dataset(self, ): - if not self.roidbs: - self.roidbs = self._load_video_images() + if not os.path.exists(out_full_path): + os.makedirs(out_full_path) - def _load_video_images(self): - self.video_frames = de.VideoReader(self.video_file) - self.video_length = len(self.video_frames) - logger.info('Length of the video: {:d} frames.'.format( - self.video_length)) - records = [] - for idx in range(self.video_length): - image = self.video_frames.get_batch([idx]).asnumpy()[0] - im_shape = image.shape - rec = { - 'im_id': np.array([idx]), - 'image': image, - 'h': im_shape[0], - 'w': im_shape[1], - 'im_shape': np.array( - im_shape[:2], dtype=np.float32), - 'scale_factor': np.array( - [1., 1.], dtype=np.float32), - } - if self.keep_ori_im: - rec.update({'ori_image': image}) - records.append(rec) - assert len(records) > 0, "No image file found." - return records + # video file name + outformat = os.path.join(out_full_path, '%08d.png') - def set_video(self, video_file): - self.video_file = video_file - assert os.path.isfile(self.video_file) and _is_valid_video(self.video_file), \ - "wrong or unsupported file format: {}".format(self.video_file) - self.roidbs = self._load_video_images() + cmd = ffmpeg + cmd = ffmpeg + [' -i ', video_path, ' -start_number ', ' 0 ', outformat] + cmd = ''.join(cmd) + _dict2str(kargs) + + try: + os.system(cmd) + except: + raise RuntimeError('ffmpeg process video: {} error'.format(vid_name)) + sys.stdout.flush() + sys.exit(-1) + + sys.stdout.flush() + return out_full_path diff --git a/ppdet/engine/export_utils.py b/ppdet/engine/export_utils.py index 4f12b554f..0fe932af4 100644 --- a/ppdet/engine/export_utils.py +++ b/ppdet/engine/export_utils.py @@ -58,9 +58,7 @@ def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape): label_list = [str(cat) for cat in catid2name.values()] sample_transforms = reader_cfg['sample_transforms'] - if arch != 'mot_arch': - sample_transforms = sample_transforms[1:] - for st in sample_transforms: + for st in sample_transforms[1:]: for key, value in st.items(): p = {'type': key} if key == 'Resize': @@ -82,12 +80,14 @@ def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape): return preprocess_list, label_list + def _parse_tracker(tracker_cfg): tracker_params = {} for k, v in tracker_cfg.items(): tracker_params.update({k: v}) return tracker_params + def _dump_infer_config(config, path, image_shape, model): arch_state = False from ppdet.core.config.yaml_helpers import setup_orderdict diff --git a/ppdet/engine/tracker.py b/ppdet/engine/tracker.py index 8e84f71a1..5e7383709 100644 --- a/ppdet/engine/tracker.py +++ b/ppdet/engine/tracker.py @@ -282,18 +282,25 @@ class Tracker(object): n_frame = 0 timer_avgs, timer_calls = [], [] for seq in seqs: + if not os.path.isdir(os.path.join(data_root, seq)): + continue + infer_dir = os.path.join(data_root, seq, 'img1') + seqinfo = 
diff --git a/requirements.txt b/requirements.txt
index 7ac38c2e2..e4009c30f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,3 @@ lap
 sklearn
 motmetrics
 openpyxl
-decord

diff --git a/tools/eval_mot.py b/tools/eval_mot.py
index cd608678a..9d4a9b22a 100644
--- a/tools/eval_mot.py
+++ b/tools/eval_mot.py
@@ -73,11 +73,11 @@ def parse_args():
 
 
 def run(FLAGS, cfg):
-    task = cfg['EvalMOTDataset'].task
     dataset_dir = cfg['EvalMOTDataset'].dataset_dir
     data_root = cfg['EvalMOTDataset'].data_root
     data_root = '{}/{}'.format(dataset_dir, data_root)
-    seqs = cfg['MOTDataZoo'][task]
+    seqs = os.listdir(data_root)
+    seqs.sort()
 
     # build Tracker
     tracker = Tracker(cfg, mode='eval')
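With `seqs = os.listdir(data_root)`, stray files or incomplete sequence folders can end up in the list; the new guard at the top of the tracker's sequence loop (earlier in this patch) is what filters them out. A condensed sketch of the combined behavior, with an illustrative `data_root` matching the config above:

```python
# Condensed sketch: directory-listing sequence discovery plus the tracker-side
# guard added in ppdet/engine/tracker.py (data_root is illustrative).
import os

data_root = 'dataset/mot/MOT16/images/train'
seqs = sorted(os.listdir(data_root))

for seq in seqs:
    seq_dir = os.path.join(data_root, seq)
    infer_dir = os.path.join(seq_dir, 'img1')
    seqinfo = os.path.join(seq_dir, 'seqinfo.ini')
    # Skip stray files and incomplete sequences, as the tracker loop now does.
    if not (os.path.isdir(seq_dir) and os.path.isdir(infer_dir)
            and os.path.exists(seqinfo)):
        continue
    print('evaluating', seq)
```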
diff --git a/tools/infer_mot.py b/tools/infer_mot.py
index 044d1e685..57d7e6dff 100644
--- a/tools/infer_mot.py
+++ b/tools/infer_mot.py
@@ -43,6 +43,11 @@ def parse_args():
     parser = ArgsParser()
     parser.add_argument(
         '--video_file', type=str, default=None, help='Video name for tracking.')
+    parser.add_argument(
+        "--image_dir",
+        type=str,
+        default=None,
+        help="Directory for images to perform inference on.")
     parser.add_argument(
         "--data_type",
         type=str,
@@ -95,6 +100,7 @@ def run(FLAGS, cfg):
     # inference
     tracker.mot_predict(
         video_file=FLAGS.video_file,
+        image_dir=FLAGS.image_dir,
         data_type=FLAGS.data_type,
         model_type=cfg.architecture,
         output_dir=FLAGS.output_dir,
-- 
GitLab