Fix timer in deploy (#4817)

* fix timer in deploy
* fix mot_keypoint deploy

Fix timer in deploy (#4817)
* fix timer in deploy
* fix mot_keypoint deploy
9a0f2887 · wangguanzhong · GitHub · 8ad63b1a · 9a0f2887 · 9a0f2887
8 changed files
--- a/deploy/pptracking/python/mot_jde_infer.py
+++ b/deploy/pptracking/python/mot_jde_infer.py
@@ -121,32 +121,32 @@ class JDE_Detector(Detector):
                online_scores[cls_id].append(tscore)
        return online_tlwhs, online_scores, online_ids
-    def predict(self, image_list, threshold=0.5, warmup=0, repeats=1):
+    def predict(self, image_list, threshold=0.5, repeats=1, add_timer=True):
        '''
        Args:
            image_list (list[str]): path of images, only support one image path
                (batch_size=1) in tracking model
            threshold (float): threshold of predicted box' score
+            repeats (int): repeat number for prediction
+            add_timer (bool): whether add timer during prediction
        Returns:
            online_tlwhs, online_scores, online_ids (dict[np.array])
        '''
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(image_list)
-        self.det_times.preprocess_time_s.end()
        pred_dets, pred_embs = None, None
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
+        if add_timer:
-        for i in range(warmup):
+            self.det_times.preprocess_time_s.end()
-            self.predictor.run()
-            output_names = self.predictor.get_output_names()
-            boxes_tensor = self.predictor.get_output_handle(output_names[0])
-            pred_dets = boxes_tensor.copy_to_cpu()
            self.det_times.inference_time_s.start()
+        # model prediction
        for i in range(repeats):
            self.predictor.run()
            output_names = self.predictor.get_output_names()
@@ -154,14 +154,16 @@ class JDE_Detector(Detector):
            pred_dets = boxes_tensor.copy_to_cpu()
            embs_tensor = self.predictor.get_output_handle(output_names[1])
            pred_embs = embs_tensor.copy_to_cpu()
+        if add_timer:
            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.postprocess_time_s.start()
+        # postprocess
        online_tlwhs, online_scores, online_ids = self.postprocess(
            pred_dets, pred_embs, threshold)
+        if add_timer:
            self.det_times.postprocess_time_s.end()
            self.det_times.img_num += 1
        return online_tlwhs, online_scores, online_ids
@@ -175,7 +177,12 @@ def predict_image(detector, image_list):
    for frame_id, img_file in enumerate(image_list):
        frame = cv2.imread(img_file)
        if FLAGS.run_benchmark:
-            detector.predict([img_file], FLAGS.threshold, warmup=10, repeats=10)
+            # warmup
+            detector.predict(
+                [img_file], FLAGS.threshold, repeats=10, add_timer=False)
+            # run benchmark
+            detector.predict(
+                [img_file], FLAGS.threshold, repeats=10, add_timer=True)
            cm, gm, gu = get_current_memory_mb()
            detector.cpu_mem += cm
            detector.gpu_mem += gm

--- a/deploy/pptracking/python/mot_sde_infer.py
+++ b/deploy/pptracking/python/mot_sde_infer.py
@@ -154,8 +154,8 @@ class SDE_Detector(Detector):
                ori_image_shape,
                threshold=0.5,
                scaled=False,
-                warmup=0,
+                repeats=1,
-                repeats=1):
+                add_timer=True):
        '''
        Args:
            image_path (list[str]): path of images, only support one image path
@@ -164,43 +164,46 @@ class SDE_Detector(Detector):
            threshold (float): threshold of predicted box' score
            scaled (bool): whether the coords after detector outputs are scaled,
                default False in jde yolov3, set True in general detector.
+            repeats (int): repeat number for prediction
+            add_timer (bool): whether add timer during prediction
        Returns:
            pred_dets (np.ndarray, [N, 6]): 'x,y,w,h,score,cls_id'
            pred_xyxys (np.ndarray, [N, 4]): 'x1,y1,x2,y2'
        '''
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(image_path)
-        self.det_times.preprocess_time_s.end()
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
+        if add_timer:
-        for i in range(warmup):
+            self.det_times.preprocess_time_s.end()
-            self.predictor.run()
-            output_names = self.predictor.get_output_names()
-            boxes_tensor = self.predictor.get_output_handle(output_names[0])
-            boxes = boxes_tensor.copy_to_cpu()
            self.det_times.inference_time_s.start()
+        # model prediction
        for i in range(repeats):
            self.predictor.run()
            output_names = self.predictor.get_output_names()
            boxes_tensor = self.predictor.get_output_handle(output_names[0])
            boxes = boxes_tensor.copy_to_cpu()
+        if add_timer:
            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.postprocess_time_s.start()
+        # postprocess
        if len(boxes) == 0:
            pred_dets = np.zeros((1, 6), dtype=np.float32)
            pred_xyxys = np.zeros((1, 4), dtype=np.float32)
        else:
            pred_dets, pred_xyxys = self.postprocess(
                boxes, ori_image_shape, threshold, inputs, scaled=scaled)
+        if add_timer:
            self.det_times.postprocess_time_s.end()
            self.det_times.img_num += 1
        return pred_dets, pred_xyxys
@@ -284,8 +287,8 @@ class SDE_DetectorPicoDet(DetectorPicoDet):
                ori_image_shape,
                threshold=0.5,
                scaled=False,
-                warmup=0,
+                repeats=1,
-                repeats=1):
+                add_timer=True):
        '''
        Args:
            image_path (list[str]): path of images, only support one image path
@@ -294,27 +297,26 @@ class SDE_DetectorPicoDet(DetectorPicoDet):
            threshold (float): threshold of predicted box' score
            scaled (bool): whether the coords after detector outputs are scaled,
                default False in jde yolov3, set True in general detector.
+            repeats (int): repeat number for prediction
+            add_timer (bool): whether add timer during prediction
        Returns:
            pred_dets (np.ndarray, [N, 6]): 'x,y,w,h,score,cls_id'
            pred_xyxys (np.ndarray, [N, 4]): 'x1,y1,x2,y2'
        '''
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(image_path)
-        self.det_times.preprocess_time_s.end()
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
+        if add_timer:
-        np_score_list, np_boxes_list = [], []
+            self.det_times.preprocess_time_s.end()
-        for i in range(warmup):
-            self.predictor.run()
-            output_names = self.predictor.get_output_names()
-            boxes_tensor = self.predictor.get_output_handle(output_names[0])
-            boxes = boxes_tensor.copy_to_cpu()
            self.det_times.inference_time_s.start()
+        # model prediction
        for i in range(repeats):
            self.predictor.run()
            np_score_list.clear()
@@ -328,9 +330,11 @@ class SDE_DetectorPicoDet(DetectorPicoDet):
                np_boxes_list.append(
                    self.predictor.get_output_handle(output_names[
                        out_idx + num_outs]).copy_to_cpu())
+        if add_timer:
            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.postprocess_time_s.start()
+        # postprocess
        self.picodet_postprocess = PicoDetPostProcess(
            inputs['image'].shape[2:],
            inputs['im_shape'],
@@ -346,6 +350,7 @@ class SDE_DetectorPicoDet(DetectorPicoDet):
        else:
            pred_dets, pred_xyxys = self.postprocess(boxes, ori_image_shape,
                                                     threshold)
+        if add_timer:
            self.det_times.postprocess_time_s.end()
            self.det_times.img_num += 1
@@ -503,40 +508,41 @@ class SDE_ReID(object):
    def predict(self,
                crops,
                pred_dets,
-                warmup=0,
                repeats=1,
+                add_timer=True,
                MTMCT=False,
                frame_id=0,
                seq_name=''):
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(crops)
-        self.det_times.preprocess_time_s.end()
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
-        for i in range(warmup):
+        if add_timer:
-            self.predictor.run()
+            self.det_times.preprocess_time_s.end()
-            output_names = self.predictor.get_output_names()
-            feature_tensor = self.predictor.get_output_handle(output_names[0])
-            pred_embs = feature_tensor.copy_to_cpu()
            self.det_times.inference_time_s.start()
+        # model prediction
        for i in range(repeats):
            self.predictor.run()
            output_names = self.predictor.get_output_names()
            feature_tensor = self.predictor.get_output_handle(output_names[0])
            pred_embs = feature_tensor.copy_to_cpu()
+        if add_timer:
            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.postprocess_time_s.start()
+        # postprocess
        if MTMCT == False:
            tracking_outs = self.postprocess(pred_dets, pred_embs)
        else:
            tracking_outs = self.postprocess_mtmct(pred_dets, pred_embs,
                                                   frame_id, seq_name)
+        if add_timer:
            self.det_times.postprocess_time_s.end()
            self.det_times.img_num += 1
@@ -549,13 +555,23 @@ def predict_image(detector, reid_model, image_list):
        frame = cv2.imread(img_file)
        ori_image_shape = list(frame.shape[:2])
        if FLAGS.run_benchmark:
+            # warmup
            pred_dets, pred_xyxys = detector.predict(
                [img_file],
                ori_image_shape,
                FLAGS.threshold,
                FLAGS.scaled,
-                warmup=10,
+                repeats=10,
-                repeats=10)
+                add_timer=False)
+            # run benchmark
+            pred_dets, pred_xyxys = detector.predict(
+                [img_file],
+                ori_image_shape,
+                FLAGS.threshold,
+                FLAGS.scaled,
+                repeats=10,
+                add_timer=True)
            cm, gm, gu = get_current_memory_mb()
            detector.cpu_mem += cm
            detector.gpu_mem += gm
@@ -574,8 +590,13 @@ def predict_image(detector, reid_model, image_list):
            crops = reid_model.get_crops(pred_xyxys, frame)
            if FLAGS.run_benchmark:
+                # warmup
                tracking_outs = reid_model.predict(
-                    crops, pred_dets, warmup=10, repeats=10)
+                    crops, pred_dets, repeats=10, add_timer=False)
+                # run benchmark 
+                tracking_outs = reid_model.predict(
+                    crops, pred_dets, repeats=10, add_timer=True)
            else:
                tracking_outs = reid_model.predict(crops, pred_dets)

--- a/deploy/python/det_keypoint_unite_infer.py
+++ b/deploy/python/det_keypoint_unite_infer.py
@@ -68,8 +68,12 @@ def predict_with_given_det(image, det_res, keypoint_detector,
        batch_images = rec_images[start_index:end_index]
        batch_records = np.array(records[start_index:end_index])
        if run_benchmark:
+            # warmup
            keypoint_result = keypoint_detector.predict(
-                batch_images, keypoint_threshold, warmup=10, repeats=10)
+                batch_images, keypoint_threshold, repeats=10, add_timer=False)
+            # run benchmark
+            keypoint_result = keypoint_detector.predict(
+                batch_images, keypoint_threshold, repeats=10, add_timer=True)
        else:
            keypoint_result = keypoint_detector.predict(batch_images,
                                                        keypoint_threshold)
@@ -100,8 +104,12 @@ def topdown_unite_predict(detector,
        det_timer.preprocess_time_s.end()
        if FLAGS.run_benchmark:
+            # warmup
+            results = detector.predict(
+                [image], FLAGS.det_threshold, repeats=10, add_timer=False)
+            # run benchmark
            results = detector.predict(
-                [image], FLAGS.det_threshold, warmup=10, repeats=10)
+                [image], FLAGS.det_threshold, repeats=10, add_timer=True)
            cm, gm, gu = get_current_memory_mb()
            detector.cpu_mem += cm
            detector.gpu_mem += gm

--- a/deploy/python/infer.py
+++ b/deploy/python/infer.py
@@ -126,35 +126,33 @@ class Detector(object):
            results['masks'] = np_masks
        return results
-    def predict(self, image_list, threshold=0.5, warmup=0, repeats=1):
+    def predict(self, image_list, threshold=0.5, repeats=1, add_timer=True):
        '''
        Args:
            image_list (list): list of image
            threshold (float): threshold of predicted box' score
+            repeats (int): repeat number for prediction
+            add_timer (bool): whether add timer during prediction
        Returns:
            results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box,
                            matix element:[class, score, x_min, y_min, x_max, y_max]
                            MaskRCNN's results include 'masks': np.ndarray:
                            shape: [N, im_h, im_w]
        '''
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(image_list)
-        self.det_times.preprocess_time_s.end()
        np_boxes, np_masks = None, None
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
-        for i in range(warmup):
+        if add_timer:
-            self.predictor.run()
+            self.det_times.preprocess_time_s.end()
-            output_names = self.predictor.get_output_names()
-            boxes_tensor = self.predictor.get_output_handle(output_names[0])
-            np_boxes = boxes_tensor.copy_to_cpu()
-            if self.pred_config.mask:
-                masks_tensor = self.predictor.get_output_handle(output_names[2])
-                np_masks = masks_tensor.copy_to_cpu()
            self.det_times.inference_time_s.start()
+        # model prediction
        for i in range(repeats):
            self.predictor.run()
            output_names = self.predictor.get_output_names()
@@ -165,9 +163,12 @@ class Detector(object):
            if self.pred_config.mask:
                masks_tensor = self.predictor.get_output_handle(output_names[2])
                np_masks = masks_tensor.copy_to_cpu()
-        self.det_times.inference_time_s.end(repeats=repeats)
+        if add_timer:
+            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.postprocess_time_s.start()
+        # postprocess
        results = []
        if reduce(lambda x, y: x * y, np_boxes.shape) < 6:
            print('[WARNNING] No object detected.')
@@ -175,6 +176,7 @@ class Detector(object):
        else:
            results = self.postprocess(
                np_boxes, np_masks, inputs, np_boxes_num, threshold=threshold)
+        if add_timer:
            self.det_times.postprocess_time_s.end()
            self.det_times.img_num += len(image_list)
        return results
@@ -229,35 +231,29 @@ class DetectorSOLOv2(Detector):
        self.det_times = Timer()
        self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0
-    def predict(self, image, threshold=0.5, warmup=0, repeats=1):
+    def predict(self, image, threshold=0.5, repeats=1, add_timer=True):
        '''
        Args:
            image (str/np.ndarray): path of image/ np.ndarray read by cv2
            threshold (float): threshold of predicted box' score
+            repeats (int): repeat number for prediction
+            add_timer (bool): whether add timer during prediction
        Returns:
            results (dict): 'segm': np.ndarray,shape:[N, im_h, im_w]
                            'cate_label': label of segm, shape:[N]
                            'cate_score': confidence score of segm, shape:[N]
        '''
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(image)
-        self.det_times.preprocess_time_s.end()
        np_label, np_score, np_segms = None, None, None
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
-        for i in range(warmup):
+        if add_timer:
-            self.predictor.run()
+            self.det_times.preprocess_time_s.end()
-            output_names = self.predictor.get_output_names()
-            np_boxes_num = self.predictor.get_output_handle(output_names[
-                0]).copy_to_cpu()
-            np_label = self.predictor.get_output_handle(output_names[
-                1]).copy_to_cpu()
-            np_score = self.predictor.get_output_handle(output_names[
-                2]).copy_to_cpu()
-            np_segms = self.predictor.get_output_handle(output_names[
-                3]).copy_to_cpu()
            self.det_times.inference_time_s.start()
        for i in range(repeats):
            self.predictor.run()
@@ -270,6 +266,7 @@ class DetectorSOLOv2(Detector):
                2]).copy_to_cpu()
            np_segms = self.predictor.get_output_handle(output_names[
                3]).copy_to_cpu()
+        if add_timer:
            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.img_num += 1
@@ -326,38 +323,32 @@ class DetectorPicoDet(Detector):
        self.det_times = Timer()
        self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0
-    def predict(self, image, threshold=0.5, warmup=0, repeats=1):
+    def predict(self, image, threshold=0.5, repeats=1, add_timer=True):
        '''
        Args:
            image (str/np.ndarray): path of image/ np.ndarray read by cv2
            threshold (float): threshold of predicted box' score
+            repeats (int): repeat number for prediction
+            add_timer (bool): whether add timer during prediction
        Returns:
            results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box,
                            matix element:[class, score, x_min, y_min, x_max, y_max]
        '''
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(image)
-        self.det_times.preprocess_time_s.end()
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
-        np_score_list, np_boxes_list = [], []
-        for i in range(warmup):
-            self.predictor.run()
-            np_score_list.clear()
-            np_boxes_list.clear()
-            output_names = self.predictor.get_output_names()
-            num_outs = int(len(output_names) / 2)
-            for out_idx in range(num_outs):
-                np_score_list.append(
-                    self.predictor.get_output_handle(output_names[out_idx])
-                    .copy_to_cpu())
-                np_boxes_list.append(
-                    self.predictor.get_output_handle(output_names[
-                        out_idx + num_outs]).copy_to_cpu())
+        np_score_list, np_boxes_list = [], []
+        if add_timer:
+            self.det_times.preprocess_time_s.end()
            self.det_times.inference_time_s.start()
+        # model_prediction
        for i in range(repeats):
            self.predictor.run()
            np_score_list.clear()
@@ -371,9 +362,12 @@ class DetectorPicoDet(Detector):
                np_boxes_list.append(
                    self.predictor.get_output_handle(output_names[
                        out_idx + num_outs]).copy_to_cpu())
+        if add_timer:
            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.img_num += 1
            self.det_times.postprocess_time_s.start()
+        # postprocess
        self.postprocess = PicoDetPostProcess(
            inputs['image'].shape[2:],
            inputs['im_shape'],
@@ -381,6 +375,7 @@ class DetectorPicoDet(Detector):
            strides=self.pred_config.fpn_stride,
            nms_threshold=self.pred_config.nms['nms_threshold'])
        np_boxes, np_boxes_num = self.postprocess(np_score_list, np_boxes_list)
+        if add_timer:
            self.det_times.postprocess_time_s.end()
        return dict(boxes=np_boxes, boxes_num=np_boxes_num)
@@ -647,8 +642,13 @@ def predict_image(detector, image_list, batch_size=1):
        end_index = min((i + 1) * batch_size, len(image_list))
        batch_image_list = image_list[start_index:end_index]
        if FLAGS.run_benchmark:
+            # warmup
            detector.predict(
-                batch_image_list, FLAGS.threshold, warmup=10, repeats=10)
+                batch_image_list, FLAGS.threshold, repeats=10, add_timer=False)
+            # run benchmark
+            detector.predict(
+                batch_image_list, FLAGS.threshold, repeats=10, add_timer=True)
            cm, gm, gu = get_current_memory_mb()
            detector.cpu_mem += cm
            detector.gpu_mem += gm
@@ -681,7 +681,7 @@ def predict_video(detector, camera_id):
    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)
    out_path = os.path.join(FLAGS.output_dir, video_out_name)
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    fourcc = cv2.VideoWriter_fourcc(* 'mp4v')
    writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
    index = 1
    while (1):

--- a/deploy/python/keypoint_infer.py
+++ b/deploy/python/keypoint_infer.py
@@ -145,41 +145,33 @@ class KeyPoint_Detector(Detector):
            raise ValueError("Unsupported arch: {}, expect {}".format(
                self.pred_config.arch, KEYPOINT_SUPPORT_MODELS))
-    def predict(self, image_list, threshold=0.5, warmup=0, repeats=1):
+    def predict(self, image_list, threshold=0.5, repeats=1, add_timer=True):
        '''
        Args:
            image_list (list): list of image 
            threshold (float): threshold of predicted box' score
+            repeats (int): repeat number for prediction
+            add_timer (bool): whether add timer during prediction
        Returns:
            results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box,
                            matix element:[class, score, x_min, y_min, x_max, y_max]
                            MaskRCNN's results include 'masks': np.ndarray:
                            shape: [N, im_h, im_w]
        '''
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(image_list)
        np_boxes, np_masks = None, None
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
+        if add_timer:
            self.det_times.preprocess_time_s.end()
-        for i in range(warmup):
-            self.predictor.run()
-            output_names = self.predictor.get_output_names()
-            boxes_tensor = self.predictor.get_output_handle(output_names[0])
-            np_boxes = boxes_tensor.copy_to_cpu()
-            if self.pred_config.tagmap:
-                masks_tensor = self.predictor.get_output_handle(output_names[1])
-                heat_k = self.predictor.get_output_handle(output_names[2])
-                inds_k = self.predictor.get_output_handle(output_names[3])
-                np_masks = [
-                    masks_tensor.copy_to_cpu(), heat_k.copy_to_cpu(),
-                    inds_k.copy_to_cpu()
-                ]
            self.det_times.inference_time_s.start()
+        # model prediction
        for i in range(repeats):
            self.predictor.run()
            output_names = self.predictor.get_output_names()
@@ -193,11 +185,14 @@ class KeyPoint_Detector(Detector):
                    masks_tensor.copy_to_cpu(), heat_k.copy_to_cpu(),
                    inds_k.copy_to_cpu()
                ]
+        if add_timer:
            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.postprocess_time_s.start()
+        # postprocess
        results = self.postprocess(
            np_boxes, np_masks, inputs, threshold=threshold)
+        if add_timer:
            self.det_times.postprocess_time_s.end()
            self.det_times.img_num += len(image_list)
        return results
@@ -266,7 +261,12 @@ class PredictConfig_KeyPoint():
 def predict_image(detector, image_list):
    for i, img_file in enumerate(image_list):
        if FLAGS.run_benchmark:
-            detector.predict([img_file], FLAGS.threshold, warmup=10, repeats=10)
+            # warmup 
+            detector.predict(
+                [img_file], FLAGS.threshold, repeats=10, add_timer=False)
+            # run benchmark
+            detector.predict(
+                [img_file], FLAGS.threshold, repeats=10, add_timer=True)
            cm, gm, gu = get_current_memory_mb()
            detector.cpu_mem += cm
            detector.gpu_mem += gm
@@ -300,7 +300,7 @@ def predict_video(detector, camera_id):
    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)
    out_path = os.path.join(FLAGS.output_dir, video_name + '.mp4')
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    fourcc = cv2.VideoWriter_fourcc(* 'mp4v')
    writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
    index = 1
    while (1):

--- a/deploy/python/mot_jde_infer.py
+++ b/deploy/python/mot_jde_infer.py
@@ -120,31 +120,31 @@ class JDE_Detector(Detector):
                online_scores[cls_id].append(tscore)
        return online_tlwhs, online_scores, online_ids
-    def predict(self, image_list, threshold=0.5, warmup=0, repeats=1):
+    def predict(self, image_list, threshold=0.5, repeats=1, add_timer=True):
        '''
        Args:
            image_list (list): list of image
            threshold (float): threshold of predicted box' score
+            repeats (int): repeat number for prediction
+            add_timer (bool): whether add timer during prediction
        Returns:
            online_tlwhs, online_scores, online_ids (dict[np.array])
        '''
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(image_list)
-        self.det_times.preprocess_time_s.end()
        pred_dets, pred_embs = None, None
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
+        if add_timer:
-        for i in range(warmup):
+            self.det_times.preprocess_time_s.end()
-            self.predictor.run()
-            output_names = self.predictor.get_output_names()
-            boxes_tensor = self.predictor.get_output_handle(output_names[0])
-            pred_dets = boxes_tensor.copy_to_cpu()
            self.det_times.inference_time_s.start()
+        # model prediction
        for i in range(repeats):
            self.predictor.run()
            output_names = self.predictor.get_output_names()
@@ -152,11 +152,15 @@ class JDE_Detector(Detector):
            pred_dets = boxes_tensor.copy_to_cpu()
            embs_tensor = self.predictor.get_output_handle(output_names[1])
            pred_embs = embs_tensor.copy_to_cpu()
-        self.det_times.inference_time_s.end(repeats=repeats)
+        if add_timer:
+            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.postprocess_time_s.start()
+        # postprocess
        online_tlwhs, online_scores, online_ids = self.postprocess(
            pred_dets, pred_embs, threshold)
+        if add_timer:
            self.det_times.postprocess_time_s.end()
            self.det_times.img_num += 1
        return online_tlwhs, online_scores, online_ids
@@ -172,7 +176,12 @@ def predict_image(detector, image_list):
    for frame_id, img_file in enumerate(image_list):
        frame = cv2.imread(img_file)
        if FLAGS.run_benchmark:
-            detector.predict([frame], FLAGS.threshold, warmup=10, repeats=10)
+            # warmup
+            detector.predict(
+                [frame], FLAGS.threshold, repeats=10, add_timer=False)
+            # run benchmark
+            detector.predict(
+                [frame], FLAGS.threshold, repeats=10, add_timer=True)
            cm, gm, gu = get_current_memory_mb()
            detector.cpu_mem += cm
            detector.gpu_mem += gm
@@ -181,8 +190,13 @@ def predict_image(detector, image_list):
        else:
            online_tlwhs, online_scores, online_ids = detector.predict(
                [frame], FLAGS.threshold)
-            online_im = plot_tracking_dict(frame, num_classes, online_tlwhs,
+            online_im = plot_tracking_dict(
-                                           online_ids, online_scores, frame_id,
+                frame,
+                num_classes,
+                online_tlwhs,
+                online_ids,
+                online_scores,
+                frame_id,
                ids2names=ids2names)
            if FLAGS.save_images:
                if not os.path.exists(FLAGS.output_dir):
@@ -211,7 +225,7 @@ def predict_video(detector, camera_id):
        os.makedirs(FLAGS.output_dir)
    out_path = os.path.join(FLAGS.output_dir, video_name)
    if not FLAGS.save_images:
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        fourcc = cv2.VideoWriter_fourcc(* 'mp4v')
        writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
    frame_id = 0
    timer = MOTTimer()

--- a/deploy/python/mot_keypoint_unite_infer.py
+++ b/deploy/python/mot_keypoint_unite_infer.py
@@ -64,8 +64,12 @@ def mot_keypoint_unite_predict_image(mot_model,
        frame = cv2.imread(img_file)
        if FLAGS.run_benchmark:
+            # warmup
            online_tlwhs, online_scores, online_ids = mot_model.predict(
-                [frame], FLAGS.mot_threshold, warmup=10, repeats=10)
+                [frame], FLAGS.mot_threshold, repeats=10, add_timer=False)
+            # run benchmark
+            online_tlwhs, online_scores, online_ids = mot_model.predict(
+                [frame], FLAGS.mot_threshold, repeats=10, add_timer=True)
            cm, gm, gu = get_current_memory_mb()
            mot_model.cpu_mem += cm
            mot_model.gpu_mem += gm
@@ -84,13 +88,16 @@ def mot_keypoint_unite_predict_image(mot_model,
                FLAGS.run_benchmark)
        else:
-            warmup = 10 if FLAGS.run_benchmark else 0
+            if FLAGS.run_benchmark:
-            repeats = 10 if FLAGS.run_benchmark else 1
                keypoint_results = keypoint_model.predict(
                    [frame],
                    FLAGS.keypoint_threshold,
-                warmup=warmup,
+                    repeats=10,
-                repeats=repeats)
+                    add_timer=False)
+            repeats = 10 if FLAGS.run_benchmark else 1
+            keypoint_results = keypoint_model.predict(
+                [frame], FLAGS.keypoint_threshold, repeats=repeats)
        if FLAGS.run_benchmark:
            cm, gm, gu = get_current_memory_mb()
@@ -103,7 +110,7 @@ def mot_keypoint_unite_predict_image(mot_model,
                keypoint_results,
                visual_thread=FLAGS.keypoint_threshold,
                returnimg=True,
-                ids=online_ids
+                ids=online_ids[0]
                if KEYPOINT_SUPPORT_MODELS[keypoint_arch] == 'keypoint_topdown'
                else None)
@@ -144,7 +151,7 @@ def mot_keypoint_unite_predict_video(mot_model,
        os.makedirs(FLAGS.output_dir)
    out_path = os.path.join(FLAGS.output_dir, video_name)
    if not FLAGS.save_images:
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        fourcc = cv2.VideoWriter_fourcc(* 'mp4v')
        writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
    frame_id = 0
    timer_mot = FPSTimer()
@@ -193,7 +200,7 @@ def mot_keypoint_unite_predict_video(mot_model,
            keypoint_results,
            visual_thread=FLAGS.keypoint_threshold,
            returnimg=True,
-            ids=online_ids
+            ids=online_ids[0]
            if KEYPOINT_SUPPORT_MODELS[keypoint_arch] == 'keypoint_topdown' else
            None)

--- a/deploy/python/mot_sde_infer.py
+++ b/deploy/python/mot_sde_infer.py
@@ -178,40 +178,43 @@ class SDE_Detector(Detector):
        return pred_dets, pred_xyxys
-    def predict(self, image, scaled, threshold=0.5, warmup=0, repeats=1):
+    def predict(self, image, scaled, threshold=0.5, repeats=1, add_timer=True):
        '''
        Args:
            image (np.ndarray): image numpy data
-            threshold (float): threshold of predicted box' score
            scaled (bool): whether the coords after detector outputs are scaled,
                default False in jde yolov3, set True in general detector.
+            threshold (float): threshold of predicted box' score
+            repeats (int): repeat number for prediction
+            add_timer (bool): whether add timer during prediction
        Returns:
            pred_dets (np.ndarray, [N, 6])
        '''
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(image)
-        self.det_times.preprocess_time_s.end()
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
-        for i in range(warmup):
+        if add_timer:
-            self.predictor.run()
+            self.det_times.preprocess_time_s.end()
-            output_names = self.predictor.get_output_names()
-            boxes_tensor = self.predictor.get_output_handle(output_names[0])
-            boxes = boxes_tensor.copy_to_cpu()
            self.det_times.inference_time_s.start()
+        # model prediction
        for i in range(repeats):
            self.predictor.run()
            output_names = self.predictor.get_output_names()
            boxes_tensor = self.predictor.get_output_handle(output_names[0])
            boxes = boxes_tensor.copy_to_cpu()
-        self.det_times.inference_time_s.end(repeats=repeats)
+        if add_timer:
+            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.postprocess_time_s.start()
+        # postprocess
        if len(boxes) == 0:
            pred_dets = np.zeros((1, 6), dtype=np.float32)
            pred_xyxys = np.zeros((1, 4), dtype=np.float32)
@@ -223,6 +226,7 @@ class SDE_Detector(Detector):
            pred_dets, pred_xyxys = self.postprocess(
                boxes, input_shape, im_shape, scale_factor, threshold, scaled)
+        if add_timer:
            self.det_times.postprocess_time_s.end()
            self.det_times.img_num += 1
        return pred_dets, pred_xyxys
@@ -271,7 +275,8 @@ class SDE_DetectorPicoDet(DetectorPicoDet):
        assert batch_size == 1, "The JDE Detector only supports batch size=1 now"
        self.pred_config = pred_config
-    def postprocess_bboxes(self, boxes, input_shape, im_shape, scale_factor, threshold):
+    def postprocess_bboxes(self, boxes, input_shape, im_shape, scale_factor,
+                           threshold):
        over_thres_idx = np.nonzero(boxes[:, 1:2] >= threshold)[0]
        if len(over_thres_idx) == 0:
            pred_dets = np.zeros((1, 6), dtype=np.float32)
@@ -299,33 +304,35 @@ class SDE_DetectorPicoDet(DetectorPicoDet):
            (pred_tlwhs, pred_scores, pred_cls_ids), axis=1)
        return pred_dets, pred_xyxys
-    def predict(self, image, scaled, threshold=0.5, warmup=0, repeats=1):
+    def predict(self, image, scaled, threshold=0.5, repeats=1, add_timer=True):
        '''
        Args:
            image (np.ndarray): image numpy data
-            threshold (float): threshold of predicted box' score
            scaled (bool): whether the coords after detector outputs are scaled,
                default False in jde yolov3, set True in general detector.
+            threshold (float): threshold of predicted box' score
+            repeats (int): repeat number for prediction
+            add_timer (bool): whether add timer during prediction
        Returns:
            pred_dets (np.ndarray, [N, 6])
        '''
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(image)
-        self.det_times.preprocess_time_s.end()
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
-        np_score_list, np_boxes_list = [], []
+        if add_timer:
-        for i in range(warmup):
+            self.det_times.preprocess_time_s.end()
-            self.predictor.run()
-            output_names = self.predictor.get_output_names()
-            boxes_tensor = self.predictor.get_output_handle(output_names[0])
-            boxes = boxes_tensor.copy_to_cpu()
            self.det_times.inference_time_s.start()
+        # model prediction
+        np_score_list, np_boxes_list = [], []
        for i in range(repeats):
            self.predictor.run()
            np_score_list.clear()
@@ -340,9 +347,12 @@ class SDE_DetectorPicoDet(DetectorPicoDet):
                    self.predictor.get_output_handle(output_names[
                        out_idx + num_outs]).copy_to_cpu())
+        if add_timer:
            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.img_num += 1
            self.det_times.postprocess_time_s.start()
+        # postprocess
        self.postprocess = PicoDetPostProcess(
            inputs['image'].shape[2:],
            inputs['im_shape'],
@@ -360,7 +370,8 @@ class SDE_DetectorPicoDet(DetectorPicoDet):
            scale_factor = inputs['scale_factor']
            pred_dets, pred_xyxys = self.postprocess_bboxes(
                boxes, input_shape, im_shape, scale_factor, threshold)
+        if add_timer:
+            self.det_times.postprocess_time_s.end()
        return pred_dets, pred_xyxys
@@ -445,33 +456,34 @@ class SDE_ReID(object):
        return online_tlwhs, online_scores, online_ids
-    def predict(self, crops, pred_dets, warmup=0, repeats=1):
+    def predict(self, crops, pred_dets, repeats=1, add_timer=True):
+        # preprocess
+        if add_timer:
            self.det_times.preprocess_time_s.start()
        inputs = self.preprocess(crops)
-        self.det_times.preprocess_time_s.end()
        input_names = self.predictor.get_input_names()
        for i in range(len(input_names)):
            input_tensor = self.predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(inputs[input_names[i]])
+        if add_timer:
-        for i in range(warmup):
+            self.det_times.preprocess_time_s.end()
-            self.predictor.run()
-            output_names = self.predictor.get_output_names()
-            feature_tensor = self.predictor.get_output_handle(output_names[0])
-            pred_embs = feature_tensor.copy_to_cpu()
            self.det_times.inference_time_s.start()
+        # model prediction
        for i in range(repeats):
            self.predictor.run()
            output_names = self.predictor.get_output_names()
            feature_tensor = self.predictor.get_output_handle(output_names[0])
            pred_embs = feature_tensor.copy_to_cpu()
+        if add_timer:
            self.det_times.inference_time_s.end(repeats=repeats)
            self.det_times.postprocess_time_s.start()
+        # postprocess
        online_tlwhs, online_scores, online_ids = self.postprocess(pred_dets,
                                                                   pred_embs)
+        if add_timer:
            self.det_times.postprocess_time_s.end()
            self.det_times.img_num += 1
@@ -483,8 +495,20 @@ def predict_image(detector, reid_model, image_list):
    for i, img_file in enumerate(image_list):
        frame = cv2.imread(img_file)
        if FLAGS.run_benchmark:
+            # warmup
            pred_dets, pred_xyxys = detector.predict(
-                [frame], FLAGS.scaled, FLAGS.threshold, warmup=10, repeats=10)
+                [frame],
+                FLAGS.scaled,
+                FLAGS.threshold,
+                repeats=10,
+                add_timer=False)
+            # run benchmark
+            pred_dets, pred_xyxys = detector.predict(
+                [frame],
+                FLAGS.scaled,
+                FLAGS.threshold,
+                repeats=10,
+                add_timer=True)
            cm, gm, gu = get_current_memory_mb()
            detector.cpu_mem += cm
            detector.gpu_mem += gm
@@ -503,8 +527,12 @@ def predict_image(detector, reid_model, image_list):
            crops = reid_model.get_crops(pred_xyxys, frame)
            if FLAGS.run_benchmark:
+                # warmup
+                online_tlwhs, online_scores, online_ids = reid_model.predict(
+                    crops, pred_dets, repeats=10, add_timer=False)
+                # run benchmark
                online_tlwhs, online_scores, online_ids = reid_model.predict(
-                    crops, pred_dets, warmup=10, repeats=10)
+                    crops, pred_dets, repeats=10, add_timer=True)
            else:
                online_tlwhs, online_scores, online_ids = reid_model.predict(
                    crops, pred_dets)
@@ -538,7 +566,7 @@ def predict_video(detector, reid_model, camera_id):
        os.makedirs(FLAGS.output_dir)
    out_path = os.path.join(FLAGS.output_dir, video_name)
    if not FLAGS.save_images:
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        fourcc = cv2.VideoWriter_fourcc(* 'mp4v')
        writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
    frame_id = 0
    timer = MOTTimer()