From 7dccc8f63eb412efe6df1e2af46d6733e42d3137 Mon Sep 17 00:00:00 2001 From: wangguanzhong Date: Thu, 3 Mar 2022 16:10:01 +0800 Subject: [PATCH] Refactor python deploy (#5253) * refactor det deploy * refactor keypoint deploy * fix solov2 * fit mot sde pipeline infer * refine mot sde infer * fit mot jde infer pipeline * fit mot pose unite infer * precommit for format * refine keypoint detector name * clean codes * fix keypoint infer * refine format Co-authored-by: Feng Ni --- deploy/python/benchmark_utils.py | 18 +- deploy/python/det_keypoint_unite_infer.py | 105 +-- deploy/python/infer.py | 546 +++++++------- deploy/python/keypoint_infer.py | 277 ++++---- deploy/python/keypoint_postprocess.py | 3 +- deploy/python/mot_jde_infer.py | 392 +++++----- deploy/python/mot_keypoint_unite_infer.py | 338 +++++---- deploy/python/mot_keypoint_unite_utils.py | 14 +- deploy/python/mot_sde_infer.py | 824 ++++++---------------- deploy/python/tracker_config.yml | 10 + deploy/python/utils.py | 74 +- deploy/python/visualize.py | 21 +- 12 files changed, 1154 insertions(+), 1468 deletions(-) create mode 100644 deploy/python/tracker_config.yml diff --git a/deploy/python/benchmark_utils.py b/deploy/python/benchmark_utils.py index af7637288..adf362179 100644 --- a/deploy/python/benchmark_utils.py +++ b/deploy/python/benchmark_utils.py @@ -89,6 +89,8 @@ class PaddleInferBenchmark(object): self.preprocess_time_s = perf_info.get('preprocess_time_s', 0) self.postprocess_time_s = perf_info.get('postprocess_time_s', 0) + self.with_tracker = True if 'tracking_time_s' in perf_info else False + self.tracking_time_s = perf_info.get('tracking_time_s', 0) self.total_time_s = perf_info.get('total_time_s', 0) self.inference_time_s_90 = perf_info.get("inference_time_s_90", "") @@ -235,9 +237,19 @@ class PaddleInferBenchmark(object): ) self.logger.info( f"{identifier} total time spent(s): {self.total_time_s}") - self.logger.info( - f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, inference_time(ms): {round(self.inference_time_s*1000, 1)}, postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}" - ) + + if self.with_tracker: + self.logger.info( + f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, " + f"inference_time(ms): {round(self.inference_time_s*1000, 1)}, " + f"postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}, " + f"tracking_time(ms): {round(self.tracking_time_s*1000, 1)}") + else: + self.logger.info( + f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, " + f"inference_time(ms): {round(self.inference_time_s*1000, 1)}, " + f"postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}" + ) if self.inference_time_s_90: self.looger.info( f"{identifier} 90%_cost: {self.inference_time_s_90}, 99%_cost: {self.inference_time_s_99}, succ_rate: {self.succ_rate}" diff --git a/deploy/python/det_keypoint_unite_infer.py b/deploy/python/det_keypoint_unite_infer.py index a695a9f0f..a82c2c58c 100644 --- a/deploy/python/det_keypoint_unite_infer.py +++ b/deploy/python/det_keypoint_unite_infer.py @@ -18,12 +18,13 @@ import cv2 import math import numpy as np import paddle +import yaml from det_keypoint_unite_utils import argsparser from preprocess import decode_image -from infer import Detector, DetectorPicoDet, PredictConfig, print_arguments, get_test_images -from keypoint_infer import KeyPoint_Detector, PredictConfig_KeyPoint -from visualize import draw_pose +from infer import Detector, DetectorPicoDet, PredictConfig, print_arguments, 
get_test_images, bench_log +from keypoint_infer import KeyPointDetector, PredictConfig_KeyPoint +from visualize import visualize_pose from benchmark_utils import PaddleInferBenchmark from utils import get_current_memory_mb from keypoint_postprocess import translate_to_ori_images @@ -34,24 +35,6 @@ KEYPOINT_SUPPORT_MODELS = { } -def bench_log(detector, img_list, model_info, batch_size=1, name=None): - mems = { - 'cpu_rss_mb': detector.cpu_mem / len(img_list), - 'gpu_rss_mb': detector.gpu_mem / len(img_list), - 'gpu_util': detector.gpu_util * 100 / len(img_list) - } - perf_info = detector.det_times.report(average=True) - data_info = { - 'batch_size': batch_size, - 'shape': "dynamic_shape", - 'data_num': perf_info['img_num'] - } - - log = PaddleInferBenchmark(detector.config, model_info, data_info, - perf_info, mems) - log(name) - - def predict_with_given_det(image, det_res, keypoint_detector, keypoint_batch_size, det_threshold, keypoint_threshold, run_benchmark): @@ -59,32 +42,15 @@ def predict_with_given_det(image, det_res, keypoint_detector, image, det_res, det_threshold) keypoint_vector = [] score_vector = [] - rect_vector = det_rects - batch_loop_cnt = math.ceil(float(len(rec_images)) / keypoint_batch_size) - - for i in range(batch_loop_cnt): - start_index = i * keypoint_batch_size - end_index = min((i + 1) * keypoint_batch_size, len(rec_images)) - batch_images = rec_images[start_index:end_index] - batch_records = np.array(records[start_index:end_index]) - if run_benchmark: - # warmup - keypoint_result = keypoint_detector.predict( - batch_images, keypoint_threshold, repeats=10, add_timer=False) - # run benchmark - keypoint_result = keypoint_detector.predict( - batch_images, keypoint_threshold, repeats=10, add_timer=True) - else: - keypoint_result = keypoint_detector.predict(batch_images, - keypoint_threshold) - orgkeypoints, scores = translate_to_ori_images(keypoint_result, - batch_records) - keypoint_vector.append(orgkeypoints) - score_vector.append(scores) + rect_vector = det_rects + keypoint_results = keypoint_detector.predict_image( + rec_images, run_benchmark, repeats=10, visual=False) + keypoint_vector, score_vector = translate_to_ori_images(keypoint_results, + np.array(records)) keypoint_res = {} keypoint_res['keypoint'] = [ - np.vstack(keypoint_vector).tolist(), np.vstack(score_vector).tolist() + keypoint_vector.tolist(), score_vector.tolist() ] if len(keypoint_vector) > 0 else [[], []] keypoint_res['bbox'] = rect_vector return keypoint_res @@ -104,18 +70,15 @@ def topdown_unite_predict(detector, det_timer.preprocess_time_s.end() if FLAGS.run_benchmark: - # warmup - results = detector.predict( - [image], FLAGS.det_threshold, repeats=10, add_timer=False) - # run benchmark - results = detector.predict( - [image], FLAGS.det_threshold, repeats=10, add_timer=True) + results = detector.predict_image( + [image], run_benchmark=True, repeats=10) + cm, gm, gu = get_current_memory_mb() detector.cpu_mem += cm detector.gpu_mem += gm detector.gpu_util += gu else: - results = detector.predict([image], FLAGS.det_threshold) + results = detector.predict_image([image], visual=False) if results['boxes_num'] == 0: continue @@ -137,10 +100,10 @@ def topdown_unite_predict(detector, else: if not os.path.exists(FLAGS.output_dir): os.makedirs(FLAGS.output_dir) - draw_pose( + visualize_pose( img_file, keypoint_res, - visual_thread=FLAGS.keypoint_threshold, + visual_thresh=FLAGS.keypoint_threshold, save_dir=FLAGS.output_dir) if save_res: """ @@ -164,8 +127,7 @@ def topdown_unite_predict_video(detector, 
capture = cv2.VideoCapture(camera_id) else: capture = cv2.VideoCapture(FLAGS.video_file) - video_name = os.path.splitext(os.path.basename(FLAGS.video_file))[ - 0] + '.mp4' + video_name = os.path.split(FLAGS.video_file)[-1] # Get Video info : resolution, fps, frame count width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) @@ -176,7 +138,7 @@ def topdown_unite_predict_video(detector, if not os.path.exists(FLAGS.output_dir): os.makedirs(FLAGS.output_dir) out_path = os.path.join(FLAGS.output_dir, video_name) - fourcc = cv2.VideoWriter_fourcc(* 'mp4v') + fourcc = cv2.VideoWriter_fourcc(*'mp4v') writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) index = 0 store_res = [] @@ -188,16 +150,17 @@ def topdown_unite_predict_video(detector, print('detect frame: %d' % (index)) frame2 = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - results = detector.predict([frame2], FLAGS.det_threshold) + + results = detector.predict_image([frame2], visual=False) keypoint_res = predict_with_given_det( frame2, results, topdown_keypoint_detector, keypoint_batch_size, FLAGS.det_threshold, FLAGS.keypoint_threshold, FLAGS.run_benchmark) - im = draw_pose( + im = visualize_pose( frame, keypoint_res, - visual_thread=FLAGS.keypoint_threshold, + visual_thresh=FLAGS.keypoint_threshold, returnimg=True) if save_res: store_res.append([ @@ -211,6 +174,7 @@ def topdown_unite_predict_video(detector, if cv2.waitKey(1) & 0xFF == ord('q'): break writer.release() + print('output_video saved to: {}'.format(out_path)) if save_res: """ 1) store_res: a list of frame_data @@ -224,13 +188,15 @@ def topdown_unite_predict_video(detector, def main(): - pred_config = PredictConfig(FLAGS.det_model_dir) + deploy_file = os.path.join(FLAGS.det_model_dir, 'infer_cfg.yml') + with open(deploy_file) as f: + yml_conf = yaml.safe_load(f) + arch = yml_conf['arch'] detector_func = 'Detector' - if pred_config.arch == 'PicoDet': + if arch == 'PicoDet': detector_func = 'DetectorPicoDet' - detector = eval(detector_func)(pred_config, - FLAGS.det_model_dir, + detector = eval(detector_func)(FLAGS.det_model_dir, device=FLAGS.device, run_mode=FLAGS.run_mode, trt_min_shape=FLAGS.trt_min_shape, @@ -238,14 +204,10 @@ def main(): trt_opt_shape=FLAGS.trt_opt_shape, trt_calib_mode=FLAGS.trt_calib_mode, cpu_threads=FLAGS.cpu_threads, - enable_mkldnn=FLAGS.enable_mkldnn) + enable_mkldnn=FLAGS.enable_mkldnn, + threshold=FLAGS.det_threshold) - pred_config = PredictConfig_KeyPoint(FLAGS.keypoint_model_dir) - assert KEYPOINT_SUPPORT_MODELS[ - pred_config. - arch] == 'keypoint_topdown', 'Detection-Keypoint unite inference only supports topdown models.' - topdown_keypoint_detector = KeyPoint_Detector( - pred_config, + topdown_keypoint_detector = KeyPointDetector( FLAGS.keypoint_model_dir, device=FLAGS.device, run_mode=FLAGS.run_mode, @@ -257,6 +219,9 @@ def main(): cpu_threads=FLAGS.cpu_threads, enable_mkldnn=FLAGS.enable_mkldnn, use_dark=FLAGS.use_dark) + keypoint_arch = topdown_keypoint_detector.pred_config.arch + assert KEYPOINT_SUPPORT_MODELS[ + keypoint_arch] == 'keypoint_topdown', 'Detection-Keypoint unite inference only supports topdown models.' 
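The refactor above replaces the old free-standing helpers with a unified API: the detector class is chosen from the `arch` field of the exported `infer_cfg.yml`, and both stages expose the same `predict_image` method. A minimal sketch of driving the two stages together under assumed export directories `det_model/` and `keypoint_model/` (the directory paths and image file are placeholders; every class and function name comes from this patch):

```python
# Hedged sketch of the refactored top-down det + keypoint pipeline.
# 'det_model', 'keypoint_model' and 'demo.jpg' are placeholder paths.
import cv2

from infer import Detector
from keypoint_infer import KeyPointDetector
from det_keypoint_unite_infer import predict_with_given_det

detector = Detector('det_model', device='GPU', threshold=0.5)         # placeholder dir
keypoint_detector = KeyPointDetector('keypoint_model', device='GPU')  # placeholder dir

image = cv2.cvtColor(cv2.imread('demo.jpg'), cv2.COLOR_BGR2RGB)
det_res = detector.predict_image([image], visual=False)  # dict with 'boxes', 'boxes_num'
keypoint_res = predict_with_given_det(
    image, det_res, keypoint_detector,
    keypoint_batch_size=1, det_threshold=0.5,
    keypoint_threshold=0.5, run_benchmark=False)
print(keypoint_res['keypoint'])  # [keypoints, scores]; 'bbox' holds the person rects
```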
     # predict from video file or camera video stream
     if FLAGS.video_file is not None or FLAGS.camera_id != -1:
diff --git a/deploy/python/infer.py b/deploy/python/infer.py
index 17e70de68..9b3ec009b 100644
--- a/deploy/python/infer.py
+++ b/deploy/python/infer.py
@@ -24,9 +24,15 @@ import paddle
 from paddle.inference import Config
 from paddle.inference import create_predictor
 
+import sys
+# add deploy path of PaddleDetection to sys.path
+parent_path = os.path.abspath(os.path.join(__file__, *(['..'])))
+sys.path.insert(0, parent_path)
+
 from benchmark_utils import PaddleInferBenchmark
 from picodet_postprocess import PicoDetPostProcess
 from preprocess import preprocess, Resize, NormalizeImage, Permute, PadStride, LetterBoxResize, WarpAffine
+from keypoint_preprocess import EvalAffine, TopDownEvalAffine, expand_crop
 from visualize import visualize_box_mask
 from utils import argsparser, Timer, get_current_memory_mb
 
@@ -47,9 +53,27 @@ SUPPORT_MODELS = {
     'PicoDet',
     'CenterNet',
     'TOOD',
+    'StrongBaseline',
 }
 
 
+def bench_log(detector, img_list, model_info, batch_size=1, name=None):
+    mems = {
+        'cpu_rss_mb': detector.cpu_mem / len(img_list),
+        'gpu_rss_mb': detector.gpu_mem / len(img_list),
+        'gpu_util': detector.gpu_util * 100 / len(img_list)
+    }
+    perf_info = detector.det_times.report(average=True)
+    data_info = {
+        'batch_size': batch_size,
+        'shape': "dynamic_shape",
+        'data_num': perf_info['img_num']
+    }
+    log = PaddleInferBenchmark(detector.config, model_info, data_info,
+                               perf_info, mems)
+    log(name)
+
+
 class Detector(object):
     """
     Args:
@@ -65,21 +89,25 @@ class Detector(object):
             calibration, trt_calib_mode need to set True
         cpu_threads (int): cpu threads
         enable_mkldnn (bool): whether to open MKLDNN
+        output_dir (str): The path of output
+        threshold (float): The threshold of score for visualization
     """
 
-    def __init__(self,
-                 pred_config,
-                 model_dir,
-                 device='CPU',
-                 run_mode='paddle',
-                 batch_size=1,
-                 trt_min_shape=1,
-                 trt_max_shape=1280,
-                 trt_opt_shape=640,
-                 trt_calib_mode=False,
-                 cpu_threads=1,
-                 enable_mkldnn=False):
-        self.pred_config = pred_config
+    def __init__(
+            self,
+            model_dir,
+            device='CPU',
+            run_mode='paddle',
+            batch_size=1,
+            trt_min_shape=1,
+            trt_max_shape=1280,
+            trt_opt_shape=640,
+            trt_calib_mode=False,
+            cpu_threads=1,
+            enable_mkldnn=False,
+            output_dir='output',
+            threshold=0.5, ):
+        self.pred_config = self.set_config(model_dir)
         self.predictor, self.config = load_predictor(
             model_dir,
             run_mode=run_mode,
@@ -95,6 +123,12 @@ class Detector(object):
             enable_mkldnn=enable_mkldnn)
         self.det_times = Timer()
         self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0
+        self.batch_size = batch_size
+        self.output_dir = output_dir
+        self.threshold = threshold
+
+    def set_config(self, model_dir):
+        return PredictConfig(model_dir)
 
     def preprocess(self, image_list):
         preprocess_ops = []
@@ -110,49 +144,34 @@ class Detector(object):
             input_im_lst.append(im)
             input_im_info_lst.append(im_info)
         inputs = create_inputs(input_im_lst, input_im_info_lst)
+        input_names = self.predictor.get_input_names()
+        for i in range(len(input_names)):
+            input_tensor = self.predictor.get_input_handle(input_names[i])
+            input_tensor.copy_from_cpu(inputs[input_names[i]])
+
         return inputs
 
-    def postprocess(self,
-                    np_boxes,
-                    np_masks,
-                    inputs,
-                    np_boxes_num,
-                    threshold=0.5):
+    def postprocess(self, inputs, result):
         # postprocess output of predictor
-        results = {}
-        results['boxes'] = np_boxes
-        results['boxes_num'] = np_boxes_num
-        if np_masks is not None:
-            results['masks'] = np_masks
-        return results
+
np_boxes_num = result['boxes_num']
+        if np_boxes_num[0] <= 0:
+            print('[WARNING] No object detected.')
+            result = {'boxes': np.zeros([0, 6]), 'boxes_num': [0]}
+        result = {k: v for k, v in result.items() if v is not None}
+        return result
 
-    def predict(self, image_list, threshold=0.5, repeats=1, add_timer=True):
+    def predict(self, repeats=1):
         '''
         Args:
-            image_list (list): list of image
-            threshold (float): threshold of predicted box' score
-            repeats (int): repeat number for prediction
-            add_timer (bool): whether add timer during prediction
+            repeats (int): repeat number for prediction
         Returns:
-            results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box,
+            result (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box,
                             matix element:[class, score, x_min, y_min, x_max, y_max]
-            MaskRCNN's results include 'masks': np.ndarray:
+            MaskRCNN's result includes 'masks': np.ndarray:
                             shape: [N, im_h, im_w]
         '''
-        # preprocess
-        if add_timer:
-            self.det_times.preprocess_time_s.start()
-        inputs = self.preprocess(image_list)
-        np_boxes, np_masks = None, None
-        input_names = self.predictor.get_input_names()
-        for i in range(len(input_names)):
-            input_tensor = self.predictor.get_input_handle(input_names[i])
-            input_tensor.copy_from_cpu(inputs[input_names[i]])
-        if add_timer:
-            self.det_times.preprocess_time_s.end()
-            self.det_times.inference_time_s.start()
-        # model prediction
+        np_boxes, np_masks = None, None
         for i in range(repeats):
             self.predictor.run()
             output_names = self.predictor.get_output_names()
@@ -163,32 +182,136 @@ class Detector(object):
             if self.pred_config.mask:
                 masks_tensor = self.predictor.get_output_handle(output_names[2])
                 np_masks = masks_tensor.copy_to_cpu()
+        result = dict(boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num)
+        return result
+
+    def merge_batch_result(self, batch_result):
+        if len(batch_result) == 1:
+            return batch_result[0]
+        res_key = batch_result[0].keys()
+        results = {k: [] for k in res_key}
+        for res in batch_result:
+            for k, v in res.items():
+                results[k].append(v)
+        for k, v in results.items():
+            results[k] = np.concatenate(v)
+        return results
 
-        if add_timer:
-            self.det_times.inference_time_s.end(repeats=repeats)
-            self.det_times.postprocess_time_s.start()
+    def get_timer(self):
+        return self.det_times
 
-        # postprocess
+    def predict_image(self,
+                      image_list,
+                      run_benchmark=False,
+                      repeats=1,
+                      visual=True):
+        batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size)
         results = []
-        if reduce(lambda x, y: x * y, np_boxes.shape) < 6:
-            print('[WARNNING] No object detected.')
-            results = {'boxes': np.zeros([0, 6]), 'boxes_num': [0]}
-        else:
-            results = self.postprocess(
-                np_boxes, np_masks, inputs, np_boxes_num, threshold=threshold)
-        if add_timer:
-            self.det_times.postprocess_time_s.end()
-            self.det_times.img_num += len(image_list)
+        for i in range(batch_loop_cnt):
+            start_index = i * self.batch_size
+            end_index = min((i + 1) * self.batch_size, len(image_list))
+            batch_image_list = image_list[start_index:end_index]
+            if run_benchmark:
+                # preprocess
+                inputs = self.preprocess(batch_image_list)  # warmup
+                self.det_times.preprocess_time_s.start()
+                inputs = self.preprocess(batch_image_list)
+                self.det_times.preprocess_time_s.end()
+
+                # model prediction
+                result = self.predict(repeats=repeats)  # warmup
+                self.det_times.inference_time_s.start()
+                result = self.predict(repeats=repeats)
+                self.det_times.inference_time_s.end(repeats=repeats)
+
+                # postprocess
+                result_warmup = self.postprocess(inputs, result)  # warmup
+
self.det_times.postprocess_time_s.start() + result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + self.det_times.img_num += len(batch_image_list) + + cm, gm, gu = get_current_memory_mb() + self.cpu_mem += cm + self.gpu_mem += gm + self.gpu_util += gu + else: + # preprocess + self.det_times.preprocess_time_s.start() + inputs = self.preprocess(batch_image_list) + self.det_times.preprocess_time_s.end() + + # model prediction + self.det_times.inference_time_s.start() + result = self.predict() + self.det_times.inference_time_s.end() + + # postprocess + self.det_times.postprocess_time_s.start() + result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + self.det_times.img_num += len(batch_image_list) + + if visual: + visualize( + batch_image_list, + result, + self.pred_config.labels, + output_dir=self.output_dir, + threshold=self.threshold) + + results.append(result) + if visual: + print('Test iter {}'.format(i)) + + results = self.merge_batch_result(results) return results - def get_timer(self): - return self.det_times + def predict_video(self, video_file, camera_id): + video_out_name = 'output.mp4' + if camera_id != -1: + capture = cv2.VideoCapture(camera_id) + else: + capture = cv2.VideoCapture(video_file) + video_out_name = os.path.split(video_file)[-1] + # Get Video info : resolution, fps, frame count + width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = int(capture.get(cv2.CAP_PROP_FPS)) + frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) + print("fps: %d, frame_count: %d" % (fps, frame_count)) + + if not os.path.exists(self.output_dir): + os.makedirs(self.output_dir) + out_path = os.path.join(self.output_dir, video_out_name) + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) + index = 1 + while (1): + ret, frame = capture.read() + if not ret: + break + print('detect frame: %d' % (index)) + index += 1 + results = self.predict_image([frame], visual=False) + + im = visualize_box_mask( + frame, + results, + self.pred_config.labels, + threshold=self.threshold) + im = np.array(im) + writer.write(im) + if camera_id != -1: + cv2.imshow('Mask Detection', im) + if cv2.waitKey(1) & 0xFF == ord('q'): + break + writer.release() class DetectorSOLOv2(Detector): """ Args: - config (object): config of model, defined by `Config(model_dir)` model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU run_mode (str): mode of running(paddle/trt_fp32/trt_fp16) @@ -200,61 +323,49 @@ class DetectorSOLOv2(Detector): calibration, trt_calib_mode need to set True cpu_threads (int): cpu threads enable_mkldnn (bool): whether to open MKLDNN + output_dir (str): The path of output + threshold (float): The threshold of score for visualization + """ - def __init__(self, - pred_config, - model_dir, - device='CPU', - run_mode='paddle', - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False): - self.pred_config = pred_config - self.predictor, self.config = load_predictor( + def __init__( + self, model_dir, + device='CPU', + run_mode='paddle', + batch_size=1, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + output_dir='./', + threshold=0.5, ): + super(DetectorSOLOv2, 
self).__init__( + model_dir=model_dir, + device=device, run_mode=run_mode, batch_size=batch_size, - min_subgraph_size=self.pred_config.min_subgraph_size, - device=device, - use_dynamic_shape=self.pred_config.use_dynamic_shape, trt_min_shape=trt_min_shape, trt_max_shape=trt_max_shape, trt_opt_shape=trt_opt_shape, trt_calib_mode=trt_calib_mode, cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn) - self.det_times = Timer() - self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0 + enable_mkldnn=enable_mkldnn, + output_dir=output_dir, + threshold=threshold, ) - def predict(self, image, threshold=0.5, repeats=1, add_timer=True): + def predict(self, repeats=1): ''' Args: - image (str/np.ndarray): path of image/ np.ndarray read by cv2 - threshold (float): threshold of predicted box' score repeats (int): repeat number for prediction - add_timer (bool): whether add timer during prediction Returns: - results (dict): 'segm': np.ndarray,shape:[N, im_h, im_w] + result (dict): 'segm': np.ndarray,shape:[N, im_h, im_w] 'cate_label': label of segm, shape:[N] 'cate_score': confidence score of segm, shape:[N] ''' - # preprocess - if add_timer: - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(image) np_label, np_score, np_segms = None, None, None - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle(input_names[i]) - input_tensor.copy_from_cpu(inputs[input_names[i]]) - if add_timer: - self.det_times.preprocess_time_s.end() - self.det_times.inference_time_s.start() for i in range(repeats): self.predictor.run() output_names = self.predictor.get_output_names() @@ -266,21 +377,18 @@ class DetectorSOLOv2(Detector): 2]).copy_to_cpu() np_segms = self.predictor.get_output_handle(output_names[ 3]).copy_to_cpu() - if add_timer: - self.det_times.inference_time_s.end(repeats=repeats) - self.det_times.img_num += 1 - return dict( + result = dict( segm=np_segms, label=np_label, score=np_score, boxes_num=np_boxes_num) + return result class DetectorPicoDet(Detector): """ Args: - config (object): config of model, defined by `Config(model_dir)` model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU run_mode (str): mode of running(paddle/trt_fp32/trt_fp16) @@ -294,61 +402,57 @@ class DetectorPicoDet(Detector): enable_mkldnn (bool): whether to open MKLDNN """ - def __init__(self, - pred_config, - model_dir, - device='CPU', - run_mode='paddle', - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False): - self.pred_config = pred_config - self.predictor, self.config = load_predictor( + def __init__( + self, model_dir, + device='CPU', + run_mode='paddle', + batch_size=1, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + output_dir='./', + threshold=0.5, ): + super(DetectorPicoDet, self).__init__( + model_dir=model_dir, + device=device, run_mode=run_mode, batch_size=batch_size, - min_subgraph_size=self.pred_config.min_subgraph_size, - device=device, - use_dynamic_shape=self.pred_config.use_dynamic_shape, trt_min_shape=trt_min_shape, trt_max_shape=trt_max_shape, trt_opt_shape=trt_opt_shape, trt_calib_mode=trt_calib_mode, cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn) - self.det_times = Timer() - self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0 + 
enable_mkldnn=enable_mkldnn, + output_dir=output_dir, + threshold=threshold, ) + + def postprocess(self, inputs, result): + # postprocess output of predictor + np_score_list = result['boxes'] + np_boxes_list = result['boxes_num'] + postprocessor = PicoDetPostProcess( + inputs['image'].shape[2:], + inputs['im_shape'], + inputs['scale_factor'], + strides=self.pred_config.fpn_stride, + nms_threshold=self.pred_config.nms['nms_threshold']) + np_boxes, np_boxes_num = postprocessor(np_score_list, np_boxes_list) + result = dict(boxes=np_boxes, boxes_num=np_boxes_num) + return result - def predict(self, image, threshold=0.5, repeats=1, add_timer=True): + def predict(self, repeats=1): ''' Args: - image (str/np.ndarray): path of image/ np.ndarray read by cv2 - threshold (float): threshold of predicted box' score repeats (int): repeat number for prediction - add_timer (bool): whether add timer during prediction Returns: - results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, + result (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, matix element:[class, score, x_min, y_min, x_max, y_max] ''' - # preprocess - if add_timer: - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(image) - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle(input_names[i]) - input_tensor.copy_from_cpu(inputs[input_names[i]]) - np_score_list, np_boxes_list = [], [] - if add_timer: - self.det_times.preprocess_time_s.end() - self.det_times.inference_time_s.start() - - # model_prediction for i in range(repeats): self.predictor.run() np_score_list.clear() @@ -362,22 +466,8 @@ class DetectorPicoDet(Detector): np_boxes_list.append( self.predictor.get_output_handle(output_names[ out_idx + num_outs]).copy_to_cpu()) - if add_timer: - self.det_times.inference_time_s.end(repeats=repeats) - self.det_times.img_num += 1 - self.det_times.postprocess_time_s.start() - - # postprocess - self.postprocess = PicoDetPostProcess( - inputs['image'].shape[2:], - inputs['im_shape'], - inputs['scale_factor'], - strides=self.pred_config.fpn_stride, - nms_threshold=self.pred_config.nms['nms_threshold']) - np_boxes, np_boxes_num = self.postprocess(np_score_list, np_boxes_list) - if add_timer: - self.det_times.postprocess_time_s.end() - return dict(boxes=np_boxes, boxes_num=np_boxes_num) + result = dict(boxes=np_score_list, boxes_num=np_boxes_list) + return result def create_inputs(imgs, im_info): @@ -596,27 +686,27 @@ def get_test_images(infer_dir, infer_img): return images -def visualize(image_list, results, labels, output_dir='output/', threshold=0.5): +def visualize(image_list, result, labels, output_dir='output/', threshold=0.5): # visualize the predict result start_idx = 0 for idx, image_file in enumerate(image_list): - im_bboxes_num = results['boxes_num'][idx] + im_bboxes_num = result['boxes_num'][idx] im_results = {} - if 'boxes' in results: - im_results['boxes'] = results['boxes'][start_idx:start_idx + - im_bboxes_num, :] - if 'masks' in results: - im_results['masks'] = results['masks'][start_idx:start_idx + - im_bboxes_num, :] - if 'segm' in results: - im_results['segm'] = results['segm'][start_idx:start_idx + - im_bboxes_num, :] - if 'label' in results: - im_results['label'] = results['label'][start_idx:start_idx + - im_bboxes_num] - if 'score' in results: - im_results['score'] = results['score'][start_idx:start_idx + - im_bboxes_num] + if 'boxes' in result: + im_results['boxes'] = 
result['boxes'][start_idx:start_idx + + im_bboxes_num, :] + if 'masks' in result: + im_results['masks'] = result['masks'][start_idx:start_idx + + im_bboxes_num, :] + if 'segm' in result: + im_results['segm'] = result['segm'][start_idx:start_idx + + im_bboxes_num, :] + if 'label' in result: + im_results['label'] = result['label'][start_idx:start_idx + + im_bboxes_num] + if 'score' in result: + im_results['score'] = result['score'][start_idx:start_idx + + im_bboxes_num] start_idx += im_bboxes_num im = visualize_box_mask( @@ -636,86 +726,18 @@ def print_arguments(args): print('------------------------------------------') -def predict_image(detector, image_list, batch_size=1): - batch_loop_cnt = math.ceil(float(len(image_list)) / batch_size) - for i in range(batch_loop_cnt): - start_index = i * batch_size - end_index = min((i + 1) * batch_size, len(image_list)) - batch_image_list = image_list[start_index:end_index] - if FLAGS.run_benchmark: - # warmup - detector.predict( - batch_image_list, FLAGS.threshold, repeats=10, add_timer=False) - # run benchmark - detector.predict( - batch_image_list, FLAGS.threshold, repeats=10, add_timer=True) - - cm, gm, gu = get_current_memory_mb() - detector.cpu_mem += cm - detector.gpu_mem += gm - detector.gpu_util += gu - print('Test iter {}'.format(i)) - else: - results = detector.predict(batch_image_list, FLAGS.threshold) - visualize( - batch_image_list, - results, - detector.pred_config.labels, - output_dir=FLAGS.output_dir, - threshold=FLAGS.threshold) - - -def predict_video(detector, camera_id): - video_out_name = 'output.mp4' - if camera_id != -1: - capture = cv2.VideoCapture(camera_id) - else: - capture = cv2.VideoCapture(FLAGS.video_file) - video_out_name = os.path.split(FLAGS.video_file)[-1] - # Get Video info : resolution, fps, frame count - width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) - fps = int(capture.get(cv2.CAP_PROP_FPS)) - frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) - print("fps: %d, frame_count: %d" % (fps, frame_count)) - - if not os.path.exists(FLAGS.output_dir): - os.makedirs(FLAGS.output_dir) - out_path = os.path.join(FLAGS.output_dir, video_out_name) - fourcc = cv2.VideoWriter_fourcc(* 'mp4v') - writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) - index = 1 - while (1): - ret, frame = capture.read() - if not ret: - break - print('detect frame: %d' % (index)) - index += 1 - results = detector.predict([frame], FLAGS.threshold) - im = visualize_box_mask( - frame, - results, - detector.pred_config.labels, - threshold=FLAGS.threshold) - im = np.array(im) - writer.write(im) - if camera_id != -1: - cv2.imshow('Mask Detection', im) - if cv2.waitKey(1) & 0xFF == ord('q'): - break - writer.release() - - def main(): - pred_config = PredictConfig(FLAGS.model_dir) + deploy_file = os.path.join(FLAGS.model_dir, 'infer_cfg.yml') + with open(deploy_file) as f: + yml_conf = yaml.safe_load(f) + arch = yml_conf['arch'] detector_func = 'Detector' - if pred_config.arch == 'SOLOv2': + if arch == 'SOLOv2': detector_func = 'DetectorSOLOv2' - elif pred_config.arch == 'PicoDet': + elif arch == 'PicoDet': detector_func = 'DetectorPicoDet' - detector = eval(detector_func)(pred_config, - FLAGS.model_dir, + detector = eval(detector_func)(FLAGS.model_dir, device=FLAGS.device, run_mode=FLAGS.run_mode, batch_size=FLAGS.batch_size, @@ -724,41 +746,29 @@ def main(): trt_opt_shape=FLAGS.trt_opt_shape, trt_calib_mode=FLAGS.trt_calib_mode, cpu_threads=FLAGS.cpu_threads, - 
enable_mkldnn=FLAGS.enable_mkldnn)
+                                  enable_mkldnn=FLAGS.enable_mkldnn,
+                                  threshold=FLAGS.threshold,
+                                  output_dir=FLAGS.output_dir)
 
     # predict from video file or camera video stream
     if FLAGS.video_file is not None or FLAGS.camera_id != -1:
-        predict_video(detector, FLAGS.camera_id)
+        detector.predict_video(FLAGS.video_file, FLAGS.camera_id)
     else:
         # predict from image
         if FLAGS.image_dir is None and FLAGS.image_file is not None:
            assert FLAGS.batch_size == 1, "batch_size should be 1, when image_file is not None"
         img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
-        predict_image(detector, img_list, FLAGS.batch_size)
+        detector.predict_image(img_list, FLAGS.run_benchmark, repeats=10)
         if not FLAGS.run_benchmark:
             detector.det_times.info(average=True)
         else:
-            mems = {
-                'cpu_rss_mb': detector.cpu_mem / len(img_list),
-                'gpu_rss_mb': detector.gpu_mem / len(img_list),
-                'gpu_util': detector.gpu_util * 100 / len(img_list)
-            }
-
-            perf_info = detector.det_times.report(average=True)
-            model_dir = FLAGS.model_dir
             mode = FLAGS.run_mode
+            model_dir = FLAGS.model_dir
             model_info = {
                 'model_name': model_dir.strip('/').split('/')[-1],
                 'precision': mode.split('_')[-1]
             }
-            data_info = {
-                'batch_size': FLAGS.batch_size,
-                'shape': "dynamic_shape",
-                'data_num': perf_info['img_num']
-            }
-            det_log = PaddleInferBenchmark(detector.config, model_info,
-                                           data_info, perf_info, mems)
-            det_log('Det')
+            bench_log(detector, img_list, model_info, name='DET')
 
 
 if __name__ == '__main__':
diff --git a/deploy/python/keypoint_infer.py b/deploy/python/keypoint_infer.py
index c983ff772..ec36a1930 100644
--- a/deploy/python/keypoint_infer.py
+++ b/deploy/python/keypoint_infer.py
@@ -23,10 +23,16 @@ import cv2
 import math
 import numpy as np
 import paddle
+
+import sys
+# add deploy path of PaddleDetection to sys.path
+parent_path = os.path.abspath(os.path.join(__file__, *(['..'])))
+sys.path.insert(0, parent_path)
+
 from preprocess import preprocess, NormalizeImage, Permute
 from keypoint_preprocess import EvalAffine, TopDownEvalAffine, expand_crop
 from keypoint_postprocess import HrHRNetPostProcess, HRNetPostProcess
-from visualize import draw_pose
+from visualize import visualize_pose
 from paddle.inference import Config
 from paddle.inference import create_predictor
 from utils import argsparser, Timer, get_current_memory_mb
@@ -40,13 +46,13 @@ KEYPOINT_SUPPORT_MODELS = {
 }
 
 
-class KeyPoint_Detector(Detector):
+class KeyPointDetector(Detector):
     """
     Args:
-        config (object): config of model, defined by `Config(model_dir)`
         model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml
         device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
         run_mode (str): mode of running(paddle/trt_fp32/trt_fp16)
+        batch_size (int): batch size in inference
         trt_min_shape (int): min shape for dynamic shape in trt
         trt_max_shape (int): max shape for dynamic shape in trt
         trt_opt_shape (int): opt shape for dynamic shape in trt
         trt_calib_mode (bool): If the model is produced by TRT offline quantitative
             calibration, trt_calib_mode need to set True
         cpu_threads (int): cpu threads
         enable_mkldnn (bool): whether to open MKLDNN
@@ -58,7 +64,6 @@ class KeyPoint_Detector(Detector):
     """
 
     def __init__(self,
-                 pred_config,
                  model_dir,
                  device='CPU',
                  run_mode='paddle',
@@ -69,9 +74,10 @@ class KeyPoint_Detector(Detector):
                  trt_calib_mode=False,
                  cpu_threads=1,
                  enable_mkldnn=False,
+                 output_dir='output',
+                 threshold=0.5,
                  use_dark=True):
-        super(KeyPoint_Detector, self).__init__(
-            pred_config=pred_config,
+        super(KeyPointDetector, self).__init__(
             model_dir=model_dir,
             device=device,
             run_mode=run_mode,
@@ -81,9 +87,14 @@ class KeyPoint_Detector(Detector):
             trt_opt_shape=trt_opt_shape,
trt_calib_mode=trt_calib_mode, cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn) + enable_mkldnn=enable_mkldnn, + output_dir=output_dir, + threshold=threshold, ) self.use_dark = use_dark + def set_config(self, model_dir): + return PredictConfig_KeyPoint(model_dir) + def get_person_from_rect(self, image, results, det_threshold=0.5): # crop the person result from image self.det_times.preprocess_time_s.start() @@ -103,34 +114,22 @@ class KeyPoint_Detector(Detector): self.det_times.preprocess_time_s.end() return rect_images, new_rects, org_rects - def preprocess(self, image_list): - preprocess_ops = [] - for op_info in self.pred_config.preprocess_infos: - new_op_info = op_info.copy() - op_type = new_op_info.pop('type') - preprocess_ops.append(eval(op_type)(**new_op_info)) - - input_im_lst = [] - input_im_info_lst = [] - for im in image_list: - im, im_info = preprocess(im, preprocess_ops) - input_im_lst.append(im) - input_im_info_lst.append(im_info) - inputs = create_inputs(input_im_lst, input_im_info_lst) - return inputs - - def postprocess(self, np_boxes, np_masks, inputs, threshold=0.5): + def postprocess(self, inputs, result): + np_heatmap = result['heatmap'] + np_masks = result['masks'] # postprocess output of predictor if KEYPOINT_SUPPORT_MODELS[ self.pred_config.arch] == 'keypoint_bottomup': results = {} h, w = inputs['im_shape'][0] - preds = [np_boxes] + preds = [np_heatmap] if np_masks is not None: preds += np_masks preds += [h, w] keypoint_postprocess = HrHRNetPostProcess() - results['keypoint'] = keypoint_postprocess(*preds) + kpts, scores = keypoint_postprocess(*preds) + results['keypoint'] = kpts + results['score'] = scores return results elif KEYPOINT_SUPPORT_MODELS[ self.pred_config.arch] == 'keypoint_topdown': @@ -139,44 +138,31 @@ class KeyPoint_Detector(Detector): center = np.round(imshape / 2.) scale = imshape / 200. 
keypoint_postprocess = HRNetPostProcess(use_dark=self.use_dark) - results['keypoint'] = keypoint_postprocess(np_boxes, center, scale) + kpts, scores = keypoint_postprocess(np_heatmap, center, scale) + results['keypoint'] = kpts + results['score'] = scores return results else: raise ValueError("Unsupported arch: {}, expect {}".format( self.pred_config.arch, KEYPOINT_SUPPORT_MODELS)) - def predict(self, image_list, threshold=0.5, repeats=1, add_timer=True): + def predict(self, repeats=1): ''' Args: - image_list (list): list of image - threshold (float): threshold of predicted box' score repeats (int): repeat number for prediction - add_timer (bool): whether add timer during prediction Returns: results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, matix element:[class, score, x_min, y_min, x_max, y_max] MaskRCNN's results include 'masks': np.ndarray: shape: [N, im_h, im_w] ''' - # preprocess - if add_timer: - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(image_list) - np_boxes, np_masks = None, None - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle(input_names[i]) - input_tensor.copy_from_cpu(inputs[input_names[i]]) - if add_timer: - self.det_times.preprocess_time_s.end() - self.det_times.inference_time_s.start() - # model prediction + np_heatmap, np_masks = None, None for i in range(repeats): self.predictor.run() output_names = self.predictor.get_output_names() - boxes_tensor = self.predictor.get_output_handle(output_names[0]) - np_boxes = boxes_tensor.copy_to_cpu() + heatmap_tensor = self.predictor.get_output_handle(output_names[0]) + np_heatmap = heatmap_tensor.copy_to_cpu() if self.pred_config.tagmap: masks_tensor = self.predictor.get_output_handle(output_names[1]) heat_k = self.predictor.get_output_handle(output_names[2]) @@ -185,18 +171,113 @@ class KeyPoint_Detector(Detector): masks_tensor.copy_to_cpu(), heat_k.copy_to_cpu(), inds_k.copy_to_cpu() ] - if add_timer: - self.det_times.inference_time_s.end(repeats=repeats) - self.det_times.postprocess_time_s.start() - - # postprocess - results = self.postprocess( - np_boxes, np_masks, inputs, threshold=threshold) - if add_timer: - self.det_times.postprocess_time_s.end() - self.det_times.img_num += len(image_list) + result = dict(heatmap=np_heatmap, masks=np_masks) + return result + + def predict_image(self, + image_list, + run_benchmark=False, + repeats=1, + visual=True): + results = [] + batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size) + for i in range(batch_loop_cnt): + start_index = i * self.batch_size + end_index = min((i + 1) * self.batch_size, len(image_list)) + batch_image_list = image_list[start_index:end_index] + if run_benchmark: + # preprocess + inputs = self.preprocess(batch_image_list) # warmup + self.det_times.preprocess_time_s.start() + inputs = self.preprocess(batch_image_list) + self.det_times.preprocess_time_s.end() + + # model prediction + result_warmup = self.predict(repeats=repeats) # warmup + self.det_times.inference_time_s.start() + result = self.predict(repeats=repeats) + self.det_times.inference_time_s.end(repeats=repeats) + + # postprocess + result_warmup = self.postprocess(inputs, result) # warmup + self.det_times.postprocess_time_s.start() + result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + self.det_times.img_num += len(batch_image_list) + + cm, gm, gu = get_current_memory_mb() + self.cpu_mem += cm + self.gpu_mem += gm + 
self.gpu_util += gu + + else: + # preprocess + self.det_times.preprocess_time_s.start() + inputs = self.preprocess(batch_image_list) + self.det_times.preprocess_time_s.end() + + # model prediction + self.det_times.inference_time_s.start() + result = self.predict() + self.det_times.inference_time_s.end() + + # postprocess + self.det_times.postprocess_time_s.start() + result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + self.det_times.img_num += len(batch_image_list) + + if visual: + if not os.path.exists(self.output_dir): + os.makedirs(self.output_dir) + visualize( + batch_image_list, + result, + visual_thresh=self.threshold, + save_dir=self.output_dir) + + results.append(result) + if visual: + print('Test iter {}'.format(i)) + results = self.merge_batch_result(results) return results + def predict_video(self, video_file, camera_id): + video_name = 'output.mp4' + if camera_id != -1: + capture = cv2.VideoCapture(camera_id) + else: + capture = cv2.VideoCapture(video_file) + video_name = os.path.split(video_file)[-1] + # Get Video info : resolution, fps, frame count + width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = int(capture.get(cv2.CAP_PROP_FPS)) + frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) + print("fps: %d, frame_count: %d" % (fps, frame_count)) + + if not os.path.exists(self.output_dir): + os.makedirs(self.output_dir) + out_path = os.path.join(self.output_dir, video_name) + fourcc = cv2.VideoWriter_fourcc(* 'mp4v') + writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) + index = 1 + while (1): + ret, frame = capture.read() + if not ret: + break + print('detect frame: %d' % (index)) + index += 1 + results = self.predict_image([frame], visual=False) + im = visualize_pose( + frame, results, visual_thresh=self.threshold, returnimg=True) + writer.write(im) + if camera_id != -1: + cv2.imshow('Mask Detection', im) + if cv2.waitKey(1) & 0xFF == ord('q'): + break + writer.release() + def create_inputs(imgs, im_info): """generate input for different model type @@ -258,90 +339,44 @@ class PredictConfig_KeyPoint(): print('--------------------------------------------') -def predict_image(detector, image_list): - for i, img_file in enumerate(image_list): - if FLAGS.run_benchmark: - # warmup - detector.predict( - [img_file], FLAGS.threshold, repeats=10, add_timer=False) - # run benchmark - detector.predict( - [img_file], FLAGS.threshold, repeats=10, add_timer=True) - cm, gm, gu = get_current_memory_mb() - detector.cpu_mem += cm - detector.gpu_mem += gm - detector.gpu_util += gu - print('Test iter {}, file name:{}'.format(i, img_file)) - else: - results = detector.predict([img_file], FLAGS.threshold) - if not os.path.exists(FLAGS.output_dir): - os.makedirs(FLAGS.output_dir) - draw_pose( - img_file, - results, - visual_thread=FLAGS.threshold, - save_dir=FLAGS.output_dir) - - -def predict_video(detector, camera_id): - video_name = 'output.mp4' - if camera_id != -1: - capture = cv2.VideoCapture(camera_id) - else: - capture = cv2.VideoCapture(FLAGS.video_file) - video_name = os.path.split(FLAGS.video_file)[-1] - # Get Video info : resolution, fps, frame count - width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) - fps = int(capture.get(cv2.CAP_PROP_FPS)) - frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) - print("fps: %d, frame_count: %d" % (fps, frame_count)) - - if not os.path.exists(FLAGS.output_dir): - 
os.makedirs(FLAGS.output_dir) - out_path = os.path.join(FLAGS.output_dir, video_name + '.mp4') - fourcc = cv2.VideoWriter_fourcc(* 'mp4v') - writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) - index = 1 - while (1): - ret, frame = capture.read() - if not ret: - break - print('detect frame: %d' % (index)) - index += 1 - results = detector.predict([frame], FLAGS.threshold) - im = draw_pose( - frame, results, visual_thread=FLAGS.threshold, returnimg=True) - writer.write(im) - if camera_id != -1: - cv2.imshow('Mask Detection', im) - if cv2.waitKey(1) & 0xFF == ord('q'): - break - writer.release() +def visualize(image_list, results, visual_thresh=0.6, save_dir='output'): + im_results = {} + for i, image_file in enumerate(image_list): + skeletons = results['keypoint'] + scores = results['score'] + skeleton = skeletons[i:i + 1] + score = scores[i:i + 1] + im_results['keypoint'] = [skeleton, score] + visualize_pose( + image_file, + im_results, + visual_thresh=visual_thresh, + save_dir=save_dir) def main(): - pred_config = PredictConfig_KeyPoint(FLAGS.model_dir) - detector = KeyPoint_Detector( - pred_config, + detector = KeyPointDetector( FLAGS.model_dir, device=FLAGS.device, run_mode=FLAGS.run_mode, + batch_size=FLAGS.batch_size, trt_min_shape=FLAGS.trt_min_shape, trt_max_shape=FLAGS.trt_max_shape, trt_opt_shape=FLAGS.trt_opt_shape, trt_calib_mode=FLAGS.trt_calib_mode, cpu_threads=FLAGS.cpu_threads, enable_mkldnn=FLAGS.enable_mkldnn, + threshold=FLAGS.threshold, + output_dir=FLAGS.output_dir, use_dark=FLAGS.use_dark) # predict from video file or camera video stream if FLAGS.video_file is not None or FLAGS.camera_id != -1: - predict_video(detector, FLAGS.camera_id) + detector.predict_video(FLAGS.video_file, FLAGS.camera_id) else: # predict from image img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) - predict_image(detector, img_list) + detector.predict_image(img_list, FLAGS.run_benchmark, repeats=10) if not FLAGS.run_benchmark: detector.det_times.info(average=True) else: diff --git a/deploy/python/keypoint_postprocess.py b/deploy/python/keypoint_postprocess.py index 9ef3201f8..2275df78a 100644 --- a/deploy/python/keypoint_postprocess.py +++ b/deploy/python/keypoint_postprocess.py @@ -362,7 +362,8 @@ def affine_transform(pt, t): def translate_to_ori_images(keypoint_result, batch_records): - kpts, scores = keypoint_result['keypoint'] + kpts = keypoint_result['keypoint'] + scores = keypoint_result['score'] kpts[..., 0] += batch_records[:, 0:1] kpts[..., 1] += batch_records[:, 1:2] return kpts, scores diff --git a/deploy/python/mot_jde_infer.py b/deploy/python/mot_jde_infer.py index 71b2bcf12..04603b446 100644 --- a/deploy/python/mot_jde_infer.py +++ b/deploy/python/mot_jde_infer.py @@ -18,21 +18,24 @@ import yaml import cv2 import numpy as np from collections import defaultdict - import paddle -from paddle.inference import Config -from paddle.inference import create_predictor -from utils import argsparser, Timer, get_current_memory_mb -from infer import Detector, get_test_images, print_arguments, PredictConfig from benchmark_utils import PaddleInferBenchmark +from preprocess import decode_image +from utils import argsparser, Timer, get_current_memory_mb +from infer import Detector, get_test_images, print_arguments, bench_log, PredictConfig + +# add python path +import sys +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) -from ppdet.modeling.mot.tracker import JDETracker -from ppdet.modeling.mot.visualization import 
plot_tracking_dict -from ppdet.modeling.mot.utils import MOTTimer, write_mot_results +from pptracking.python.mot import JDETracker +from pptracking.python.mot.utils import MOTTimer, write_mot_results +from pptracking.python.visualize import plot_tracking, plot_tracking_dict # Global dictionary -MOT_SUPPORT_MODELS = { +MOT_JDE_SUPPORT_MODELS = { 'JDE', 'FairMOT', } @@ -41,7 +44,6 @@ MOT_SUPPORT_MODELS = { class JDE_Detector(Detector): """ Args: - pred_config (object): config of model, defined by `Config(model_dir)` model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU run_mode (str): mode of running(paddle/trt_fp32/trt_fp16) @@ -56,8 +58,8 @@ class JDE_Detector(Detector): """ def __init__(self, - pred_config, model_dir, + tracker_config=None, device='CPU', run_mode='paddle', batch_size=1, @@ -66,9 +68,10 @@ class JDE_Detector(Detector): trt_opt_shape=608, trt_calib_mode=False, cpu_threads=1, - enable_mkldnn=False): + enable_mkldnn=False, + output_dir='output', + threshold=0.5): super(JDE_Detector, self).__init__( - pred_config=pred_config, model_dir=model_dir, device=device, run_mode=run_mode, @@ -78,17 +81,21 @@ class JDE_Detector(Detector): trt_opt_shape=trt_opt_shape, trt_calib_mode=trt_calib_mode, cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn) - assert batch_size == 1, "The JDE Detector only supports batch size=1 now" - assert pred_config.tracker, "Tracking model should have tracker" - self.num_classes = len(pred_config.labels) - - tp = pred_config.tracker - min_box_area = tp['min_box_area'] if 'min_box_area' in tp else 200 - vertical_ratio = tp['vertical_ratio'] if 'vertical_ratio' in tp else 1.6 - conf_thres = tp['conf_thres'] if 'conf_thres' in tp else 0. - tracked_thresh = tp['tracked_thresh'] if 'tracked_thresh' in tp else 0.7 - metric_type = tp['metric_type'] if 'metric_type' in tp else 'euclidean' + enable_mkldnn=enable_mkldnn, + output_dir=output_dir, + threshold=threshold, ) + assert batch_size == 1, "MOT model only supports batch_size=1." + self.det_times = Timer(with_tracker=True) + self.num_classes = len(self.pred_config.labels) + + # tracker config + assert self.pred_config.tracker, "The exported JDE Detector model should have tracker." 
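The lines that follow read optional tracker hyperparameters from the `tracker` section of the exported `infer_cfg.yml`, falling back to hard-coded defaults via `dict.get`. A standalone sketch of that pattern, with `tracker_cfg` standing in for `pred_config.tracker` and assuming the `pptracking` package is importable as in the patched `mot_jde_infer.py`; the parameter names and default values are exactly the ones this patch uses:

```python
# Hedged sketch of the tracker-config defaulting used in JDE_Detector.__init__.
# `tracker_cfg` stands in for pred_config.tracker; defaults mirror this patch.
from pptracking.python.mot import JDETracker

def build_jde_tracker(tracker_cfg, num_classes):
    cfg = tracker_cfg or {}  # assumption: tolerate a missing tracker section
    return JDETracker(
        num_classes=num_classes,
        min_box_area=cfg.get('min_box_area', 200),
        vertical_ratio=cfg.get('vertical_ratio', 1.6),
        conf_thres=cfg.get('conf_thres', 0.0),
        tracked_thresh=cfg.get('tracked_thresh', 0.7),
        metric_type=cfg.get('metric_type', 'euclidean'))
```

Using `dict.get` keeps older exported configs working: a model exported before a key existed simply picks up the default.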
+        cfg = self.pred_config.tracker
+        min_box_area = cfg.get('min_box_area', 200)
+        vertical_ratio = cfg.get('vertical_ratio', 1.6)
+        conf_thres = cfg.get('conf_thres', 0.0)
+        tracked_thresh = cfg.get('tracked_thresh', 0.7)
+        metric_type = cfg.get('metric_type', 'euclidean')
 
         self.tracker = JDETracker(
             num_classes=self.num_classes,
@@ -98,7 +105,18 @@ class JDE_Detector(Detector):
             tracked_thresh=tracked_thresh,
             metric_type=metric_type)
 
-    def postprocess(self, pred_dets, pred_embs, threshold):
+    def postprocess(self, inputs, result):
+        # postprocess output of predictor
+        np_boxes = result['pred_dets']
+        if np_boxes.shape[0] <= 0:
+            print('[WARNING] No object detected.')
+            result = {'pred_dets': np.zeros([0, 6]), 'pred_embs': None}
+        result = {k: v for k, v in result.items() if v is not None}
+        return result
+
+    def tracking(self, det_results):
+        pred_dets = det_results['pred_dets']
+        pred_embs = det_results['pred_embs']
         online_targets_dict = self.tracker.update(pred_dets, pred_embs)
 
         online_tlwhs = defaultdict(list)
@@ -110,7 +128,6 @@ class JDE_Detector(Detector):
             tlwh = t.tlwh
             tid = t.track_id
             tscore = t.score
-            if tscore < threshold: continue
             if tlwh[2] * tlwh[3] <= self.tracker.min_box_area: continue
             if self.tracker.vertical_ratio > 0 and tlwh[2] / tlwh[
                     3] > self.tracker.vertical_ratio:
@@ -120,178 +137,181 @@ class JDE_Detector(Detector):
             online_scores[cls_id].append(tscore)
         return online_tlwhs, online_scores, online_ids
 
-    def predict(self, image_list, threshold=0.5, repeats=1, add_timer=True):
+    def predict(self, repeats=1):
         '''
         Args:
-            image_list (list): list of image
-            threshold (float): threshold of predicted box' score
-            repeats (int): repeat number for prediction
-            add_timer (bool): whether add timer during prediction
+            repeats (int): repeat number for prediction
         Returns:
-            online_tlwhs, online_scores, online_ids (dict[np.array])
+            result (dict): include 'pred_dets': np.ndarray: shape:[N,6], N: number of box,
+                           matrix element:[x_min, y_min, x_max, y_max, score, class]
+            FairMOT(JDE)'s result includes 'pred_embs': np.ndarray:
+                           shape: [N, 128]
         '''
-        # preprocess
-        if add_timer:
-            self.det_times.preprocess_time_s.start()
-        inputs = self.preprocess(image_list)
-
-        pred_dets, pred_embs = None, None
-        input_names = self.predictor.get_input_names()
-        for i in range(len(input_names)):
-            input_tensor = self.predictor.get_input_handle(input_names[i])
-            input_tensor.copy_from_cpu(inputs[input_names[i]])
-        if add_timer:
-            self.det_times.preprocess_time_s.end()
-            self.det_times.inference_time_s.start()
-        # model prediction
+        np_pred_dets, np_pred_embs = None, None
         for i in range(repeats):
             self.predictor.run()
             output_names = self.predictor.get_output_names()
             boxes_tensor = self.predictor.get_output_handle(output_names[0])
-            pred_dets = boxes_tensor.copy_to_cpu()
+            np_pred_dets = boxes_tensor.copy_to_cpu()
             embs_tensor = self.predictor.get_output_handle(output_names[1])
-            pred_embs = embs_tensor.copy_to_cpu()
-
-        if add_timer:
-            self.det_times.inference_time_s.end(repeats=repeats)
-            self.det_times.postprocess_time_s.start()
-
-        # postprocess
-        online_tlwhs, online_scores, online_ids = self.postprocess(
-            pred_dets, pred_embs, threshold)
-        if add_timer:
-            self.det_times.postprocess_time_s.end()
-            self.det_times.img_num += 1
-        return online_tlwhs, online_scores, online_ids
-
-
-def predict_image(detector, image_list):
-    results = []
-    num_classes = detector.num_classes
-    data_type = 'mcmot' if num_classes > 1 else 'mot'
-    ids2names = detector.pred_config.labels
-
-    image_list.sort()
-    for frame_id,
img_file in enumerate(image_list): - frame = cv2.imread(img_file) - if FLAGS.run_benchmark: - # warmup - detector.predict( - [frame], FLAGS.threshold, repeats=10, add_timer=False) - # run benchmark - detector.predict( - [frame], FLAGS.threshold, repeats=10, add_timer=True) - cm, gm, gu = get_current_memory_mb() - detector.cpu_mem += cm - detector.gpu_mem += gm - detector.gpu_util += gu - print('Test iter {}, file name:{}'.format(frame_id, img_file)) + np_pred_embs = embs_tensor.copy_to_cpu() + + result = dict(pred_dets=np_pred_dets, pred_embs=np_pred_embs) + return result + + def predict_image(self, + image_list, + run_benchmark=False, + repeats=1, + visual=True): + mot_results = [] + num_classes = self.num_classes + image_list.sort() + ids2names = self.pred_config.labels + data_type = 'mcmot' if num_classes > 1 else 'mot' + for frame_id, img_file in enumerate(image_list): + batch_image_list = [img_file] # bs=1 in MOT model + if run_benchmark: + # preprocess + inputs = self.preprocess(batch_image_list) # warmup + self.det_times.preprocess_time_s.start() + inputs = self.preprocess(batch_image_list) + self.det_times.preprocess_time_s.end() + + # model prediction + result_warmup = self.predict(repeats=repeats) # warmup + self.det_times.inference_time_s.start() + result = self.predict(repeats=repeats) + self.det_times.inference_time_s.end(repeats=repeats) + + # postprocess + result_warmup = self.postprocess(inputs, result) # warmup + self.det_times.postprocess_time_s.start() + det_result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + + # tracking + result_warmup = self.tracking(det_result) + self.det_times.tracking_time_s.start() + online_tlwhs, online_scores, online_ids = self.tracking( + det_result) + self.det_times.tracking_time_s.end() + self.det_times.img_num += 1 + + cm, gm, gu = get_current_memory_mb() + self.cpu_mem += cm + self.gpu_mem += gm + self.gpu_util += gu + + else: + self.det_times.preprocess_time_s.start() + inputs = self.preprocess(batch_image_list) + self.det_times.preprocess_time_s.end() + + self.det_times.inference_time_s.start() + result = self.predict() + self.det_times.inference_time_s.end() + + self.det_times.postprocess_time_s.start() + det_result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + + # tracking process + self.det_times.tracking_time_s.start() + online_tlwhs, online_scores, online_ids = self.tracking( + det_result) + self.det_times.tracking_time_s.end() + self.det_times.img_num += 1 + + if visual: + if frame_id % 10 == 0: + print('Tracking frame {}'.format(frame_id)) + frame, _ = decode_image(img_file, {}) + + im = plot_tracking_dict( + frame, + num_classes, + online_tlwhs, + online_ids, + online_scores, + frame_id=frame_id, + ids2names=ids2names) + seq_name = image_list[0].split('/')[-2] + save_dir = os.path.join(self.output_dir, seq_name) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + cv2.imwrite( + os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), im) + + mot_results.append([online_tlwhs, online_scores, online_ids]) + return mot_results + + def predict_video(self, video_file, camera_id): + video_out_name = 'mot_output.mp4' + if camera_id != -1: + capture = cv2.VideoCapture(camera_id) else: - online_tlwhs, online_scores, online_ids = detector.predict( - [frame], FLAGS.threshold) - online_im = plot_tracking_dict( + capture = cv2.VideoCapture(video_file) + video_out_name = os.path.split(video_file)[-1] + # Get Video info : resolution, fps, frame count + width = 
int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = int(capture.get(cv2.CAP_PROP_FPS)) + frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) + print("fps: %d, frame_count: %d" % (fps, frame_count)) + + if not os.path.exists(self.output_dir): + os.makedirs(self.output_dir) + out_path = os.path.join(self.output_dir, video_out_name) + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) + + frame_id = 1 + timer = MOTTimer() + results = defaultdict(list) # support single class and multi classes + num_classes = self.num_classes + data_type = 'mcmot' if num_classes > 1 else 'mot' + ids2names = self.pred_config.labels + while (1): + ret, frame = capture.read() + if not ret: + break + if frame_id % 10 == 0: + print('Tracking frame: %d' % (frame_id)) + frame_id += 1 + + timer.tic() + mot_results = self.predict_image([frame], visual=False) + timer.toc() + + online_tlwhs, online_scores, online_ids = mot_results[0] + for cls_id in range(num_classes): + results[cls_id].append( + (frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id], + online_ids[cls_id])) + + fps = 1. / timer.duration + im = plot_tracking_dict( frame, num_classes, online_tlwhs, online_ids, online_scores, - frame_id, + frame_id=frame_id, + fps=fps, ids2names=ids2names) - if FLAGS.save_images: - if not os.path.exists(FLAGS.output_dir): - os.makedirs(FLAGS.output_dir) - img_name = os.path.split(img_file)[-1] - out_path = os.path.join(FLAGS.output_dir, img_name) - cv2.imwrite(out_path, online_im) - print("save result to: " + out_path) - - -def predict_video(detector, camera_id): - video_name = 'mot_output.mp4' - if camera_id != -1: - capture = cv2.VideoCapture(camera_id) - else: - capture = cv2.VideoCapture(FLAGS.video_file) - video_name = os.path.split(FLAGS.video_file)[-1] - # Get Video info : resolution, fps, frame count - width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) - fps = int(capture.get(cv2.CAP_PROP_FPS)) - frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) - print("fps: %d, frame_count: %d" % (fps, frame_count)) - - if not os.path.exists(FLAGS.output_dir): - os.makedirs(FLAGS.output_dir) - out_path = os.path.join(FLAGS.output_dir, video_name) - if not FLAGS.save_images: - fourcc = cv2.VideoWriter_fourcc(* 'mp4v') - writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) - frame_id = 0 - timer = MOTTimer() - results = defaultdict(list) # support single class and multi classes - num_classes = detector.num_classes - data_type = 'mcmot' if num_classes > 1 else 'mot' - ids2names = detector.pred_config.labels - - while (1): - ret, frame = capture.read() - if not ret: - break - timer.tic() - online_tlwhs, online_scores, online_ids = detector.predict( - [frame], FLAGS.threshold) - timer.toc() - - for cls_id in range(num_classes): - results[cls_id].append((frame_id + 1, online_tlwhs[cls_id], - online_scores[cls_id], online_ids[cls_id])) - - fps = 1. 
/ timer.average_time
-        im = plot_tracking_dict(
-            frame,
-            num_classes,
-            online_tlwhs,
-            online_ids,
-            online_scores,
-            frame_id=frame_id,
-            fps=fps,
-            ids2names=ids2names)
-        if FLAGS.save_images:
-            save_dir = os.path.join(FLAGS.output_dir, video_name.split('.')[-2])
-            if not os.path.exists(save_dir):
-                os.makedirs(save_dir)
-            cv2.imwrite(
-                os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), im)
-        else:
-            writer.write(im)
-        frame_id += 1
-        print('detect frame: %d' % (frame_id))
-        if camera_id != -1:
-            cv2.imshow('Tracking Detection', im)
-            if cv2.waitKey(1) & 0xFF == ord('q'):
-                break
-    if FLAGS.save_mot_txts:
-        result_filename = os.path.join(FLAGS.output_dir,
-                                       video_name.split('.')[-2] + '.txt')
-
-        write_mot_results(result_filename, results, data_type, num_classes)
-
-    if FLAGS.save_images:
-        save_dir = os.path.join(FLAGS.output_dir, video_name.split('.')[-2])
-        cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format(save_dir,
-                                                              out_path)
-        os.system(cmd_str)
-        print('Save video in {}.'.format(out_path))
-    else:
+            writer.write(im)
+            if camera_id != -1:
+                cv2.imshow('Tracking Detection', im)
+                if cv2.waitKey(1) & 0xFF == ord('q'):
+                    break
         writer.release()
 
 
 def main():
-    pred_config = PredictConfig(FLAGS.model_dir)
     detector = JDE_Detector(
-        pred_config,
         FLAGS.model_dir,
         device=FLAGS.device,
         run_mode=FLAGS.run_mode,
@@ -304,34 +324,22 @@ def main():
 
     # predict from video file or camera video stream
     if FLAGS.video_file is not None or FLAGS.camera_id != -1:
-        predict_video(detector, FLAGS.camera_id)
+        detector.predict_video(FLAGS.video_file, FLAGS.camera_id)
     else:
         # predict from image
         img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
-        predict_image(detector, img_list)
+        detector.predict_image(img_list, FLAGS.run_benchmark, repeats=10)
+
         if not FLAGS.run_benchmark:
             detector.det_times.info(average=True)
         else:
-            mems = {
-                'cpu_rss_mb': detector.cpu_mem / len(img_list),
-                'gpu_rss_mb': detector.gpu_mem / len(img_list),
-                'gpu_util': detector.gpu_util * 100 / len(img_list)
-            }
-            perf_info = detector.det_times.report(average=True)
-            model_dir = FLAGS.model_dir
             mode = FLAGS.run_mode
+            model_dir = FLAGS.model_dir
             model_info = {
                 'model_name': model_dir.strip('/').split('/')[-1],
                 'precision': mode.split('_')[-1]
             }
-            data_info = {
-                'batch_size': 1,
-                'shape': "dynamic_shape",
-                'data_num': perf_info['img_num']
-            }
-            det_log = PaddleInferBenchmark(detector.config, model_info,
-                                           data_info, perf_info, mems)
-            det_log('MOT')
+            bench_log(detector, img_list, model_info, name='MOT')
 
 
 if __name__ == '__main__':
diff --git a/deploy/python/mot_keypoint_unite_infer.py b/deploy/python/mot_keypoint_unite_infer.py
index 70f03db76..dee9c172e 100644
--- a/deploy/python/mot_keypoint_unite_infer.py
+++ b/deploy/python/mot_keypoint_unite_infer.py
@@ -13,31 +13,34 @@
 # limitations under the License.
import os +import json import cv2 import math -import copy import numpy as np -from collections import defaultdict import paddle - -from utils import get_current_memory_mb -from infer import Detector, PredictConfig, print_arguments, get_test_images -from visualize import draw_pose +import yaml +import copy +from collections import defaultdict from mot_keypoint_unite_utils import argsparser -from keypoint_infer import KeyPoint_Detector, PredictConfig_KeyPoint -from det_keypoint_unite_infer import predict_with_given_det, bench_log -from mot_jde_infer import JDE_Detector +from preprocess import decode_image +from infer import print_arguments, get_test_images +from mot_sde_infer import SDE_Detector, MOT_SDE_SUPPORT_MODELS +from mot_jde_infer import JDE_Detector, MOT_JDE_SUPPORT_MODELS +from keypoint_infer import KeyPointDetector, KEYPOINT_SUPPORT_MODELS +from det_keypoint_unite_infer import predict_with_given_det +from visualize import visualize_pose +from benchmark_utils import PaddleInferBenchmark +from utils import get_current_memory_mb +from keypoint_postprocess import translate_to_ori_images -from ppdet.modeling.mot.visualization import plot_tracking_dict -from ppdet.modeling.mot.utils import MOTTimer as FPSTimer -from ppdet.modeling.mot.utils import write_mot_results +# add python path +import sys +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) -# Global dictionary -KEYPOINT_SUPPORT_MODELS = { - 'HigherHRNet': 'keypoint_bottomup', - 'HRNet': 'keypoint_topdown' -} +from pptracking.python.visualize import plot_tracking, plot_tracking_dict +from pptracking.python.mot.utils import MOTTimer as FPSTimer def convert_mot_to_det(tlwhs, scores): @@ -49,94 +52,87 @@ def convert_mot_to_det(tlwhs, scores): # support single class now results['boxes'] = np.vstack( [np.hstack([0, scores[i], xyxys[i]]) for i in range(num_mot)]) + results['boxes_num'] = np.array([num_mot]) return results -def mot_keypoint_unite_predict_image(mot_model, - keypoint_model, - image_list, - keypoint_batch_size=1): - num_classes = mot_model.num_classes - assert num_classes == 1, 'Only one category mot model supported for uniting keypoint deploy.' 
- data_type = 'mot' +def mot_topdown_unite_predict(mot_detector, + topdown_keypoint_detector, + image_list, + keypoint_batch_size=1, + save_res=False): + det_timer = mot_detector.get_timer() + store_res = [] image_list.sort() + num_classes = mot_detector.num_classes for i, img_file in enumerate(image_list): - frame = cv2.imread(img_file) + # Decode image in advance in mot + pose prediction + det_timer.preprocess_time_s.start() + image, _ = decode_image(img_file, {}) + det_timer.preprocess_time_s.end() if FLAGS.run_benchmark: - # warmup - online_tlwhs, online_scores, online_ids = mot_model.predict( - [frame], FLAGS.mot_threshold, repeats=10, add_timer=False) - # run benchmark - online_tlwhs, online_scores, online_ids = mot_model.predict( - [frame], FLAGS.mot_threshold, repeats=10, add_timer=True) - cm, gm, gu = get_current_memory_mb() - mot_model.cpu_mem += cm - mot_model.gpu_mem += gm - mot_model.gpu_util += gu - - else: - online_tlwhs, online_scores, online_ids = mot_model.predict( - [frame], FLAGS.mot_threshold) - - keypoint_arch = keypoint_model.pred_config.arch - if KEYPOINT_SUPPORT_MODELS[keypoint_arch] == 'keypoint_topdown': - results = convert_mot_to_det(online_tlwhs, online_scores) - keypoint_results = predict_with_given_det( - frame, results, keypoint_model, keypoint_batch_size, - FLAGS.mot_threshold, FLAGS.keypoint_threshold, - FLAGS.run_benchmark) + mot_results = mot_detector.predict_image( + [image], run_benchmark=True, repeats=10) + cm, gm, gu = get_current_memory_mb() + mot_detector.cpu_mem += cm + mot_detector.gpu_mem += gm + mot_detector.gpu_util += gu else: - if FLAGS.run_benchmark: - keypoint_results = keypoint_model.predict( - [frame], - FLAGS.keypoint_threshold, - repeats=10, - add_timer=False) - - repeats = 10 if FLAGS.run_benchmark else 1 - keypoint_results = keypoint_model.predict( - [frame], FLAGS.keypoint_threshold, repeats=repeats) - + mot_results = mot_detector.predict_image([image], visual=False) + + online_tlwhs, online_scores, online_ids = mot_results[ + 0] # only support bs=1 in MOT model + results = convert_mot_to_det( + online_tlwhs[0], + online_scores[0]) # only support single class for mot + pose + if results['boxes_num'] == 0: + continue + + keypoint_res = predict_with_given_det( + image, results, topdown_keypoint_detector, keypoint_batch_size, + FLAGS.mot_threshold, FLAGS.keypoint_threshold, FLAGS.run_benchmark) + + if save_res: + store_res.append([ + i, keypoint_res['bbox'], + [keypoint_res['keypoint'][0], keypoint_res['keypoint'][1]] + ]) if FLAGS.run_benchmark: cm, gm, gu = get_current_memory_mb() - keypoint_model.cpu_mem += cm - keypoint_model.gpu_mem += gm - keypoint_model.gpu_util += gu + topdown_keypoint_detector.cpu_mem += cm + topdown_keypoint_detector.gpu_mem += gm + topdown_keypoint_detector.gpu_util += gu else: - im = draw_pose( - frame, - keypoint_results, - visual_thread=FLAGS.keypoint_threshold, - returnimg=True, - ids=online_ids[0] - if KEYPOINT_SUPPORT_MODELS[keypoint_arch] == 'keypoint_topdown' - else None) - - online_im = plot_tracking_dict( - im, - num_classes, - online_tlwhs, - online_ids, - online_scores, - frame_id=i) - if FLAGS.save_images: - if not os.path.exists(FLAGS.output_dir): - os.makedirs(FLAGS.output_dir) - img_name = os.path.split(img_file)[-1] - out_path = os.path.join(FLAGS.output_dir, img_name) - cv2.imwrite(out_path, online_im) - print("save result to: " + out_path) - - -def mot_keypoint_unite_predict_video(mot_model, - keypoint_model, - camera_id, - keypoint_batch_size=1): + if not os.path.exists(FLAGS.output_dir): 
+ os.makedirs(FLAGS.output_dir) + visualize_pose( + img_file, + keypoint_res, + visual_thresh=FLAGS.keypoint_threshold, + save_dir=FLAGS.output_dir) + + if save_res: + """ + 1) store_res: a list of image_data + 2) image_data: [imageid, rects, [keypoints, scores]] + 3) rects: list of rect [xmin, ymin, xmax, ymax] + 4) keypoints: 17(joint numbers)*[x, y, conf], total 51 data in list + 5) scores: mean of all joint conf + """ + with open("det_keypoint_unite_image_results.json", 'w') as wf: + json.dump(store_res, wf, indent=4) + + +def mot_topdown_unite_predict_video(mot_detector, + topdown_keypoint_detector, + camera_id, + keypoint_batch_size=1, + save_res=False): + video_name = 'output.mp4' if camera_id != -1: capture = cv2.VideoCapture(camera_id) - video_name = 'output.mp4' else: capture = cv2.VideoCapture(FLAGS.video_file) video_name = os.path.split(FLAGS.video_file)[-1] @@ -150,17 +146,12 @@ def mot_keypoint_unite_predict_video(mot_model, if not os.path.exists(FLAGS.output_dir): os.makedirs(FLAGS.output_dir) out_path = os.path.join(FLAGS.output_dir, video_name) - if not FLAGS.save_images: - fourcc = cv2.VideoWriter_fourcc(* 'mp4v') - writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) frame_id = 0 - timer_mot = FPSTimer() - timer_kp = FPSTimer() - timer_mot_kp = FPSTimer() + timer_mot, timer_kp, timer_mot_kp = FPSTimer(), FPSTimer(), FPSTimer() - # support single class and multi classes, but should be single class here - mot_results = defaultdict(list) - num_classes = mot_model.num_classes + num_classes = mot_detector.num_classes assert num_classes == 1, 'Only one category mot model supported for uniting keypoint deploy.' data_type = 'mot' @@ -168,43 +159,41 @@ def mot_keypoint_unite_predict_video(mot_model, ret, frame = capture.read() if not ret: break + if frame_id % 10 == 0: + print('Tracking frame: %d' % (frame_id)) + frame_id += 1 timer_mot_kp.tic() + + # mot model timer_mot.tic() - online_tlwhs, online_scores, online_ids = mot_model.predict( - [frame], FLAGS.mot_threshold) + mot_results = mot_detector.predict_image([frame], visual=False) timer_mot.toc() - mot_results[0].append( - (frame_id + 1, online_tlwhs[0], online_scores[0], online_ids[0])) - mot_fps = 1. / timer_mot.average_time - + online_tlwhs, online_scores, online_ids = mot_results[0] + results = convert_mot_to_det( + online_tlwhs[0], + online_scores[0]) # only support single class for mot + pose + if results['boxes_num'] == 0: + continue + + # keypoint model timer_kp.tic() - - keypoint_arch = keypoint_model.pred_config.arch - if KEYPOINT_SUPPORT_MODELS[keypoint_arch] == 'keypoint_topdown': - results = convert_mot_to_det(online_tlwhs[0], online_scores[0]) - keypoint_results = predict_with_given_det( - frame, results, keypoint_model, keypoint_batch_size, - FLAGS.mot_threshold, FLAGS.keypoint_threshold, - FLAGS.run_benchmark) - - else: - keypoint_results = keypoint_model.predict([frame], - FLAGS.keypoint_threshold) + keypoint_res = predict_with_given_det( + frame, results, topdown_keypoint_detector, keypoint_batch_size, + FLAGS.mot_threshold, FLAGS.keypoint_threshold, FLAGS.run_benchmark) timer_kp.toc() timer_mot_kp.toc() - kp_fps = 1. / timer_kp.average_time - mot_kp_fps = 1. / timer_mot_kp.average_time - im = draw_pose( + kp_fps = 1. / timer_kp.duration + mot_kp_fps = 1. 
/ timer_mot_kp.duration + + im = visualize_pose( frame, - keypoint_results, - visual_thread=FLAGS.keypoint_threshold, + keypoint_res, + visual_thresh=FLAGS.keypoint_threshold, returnimg=True, - ids=online_ids[0] - if KEYPOINT_SUPPORT_MODELS[keypoint_arch] == 'keypoint_topdown' else - None) + ids=online_ids[0]) - online_im = plot_tracking_dict( + im = plot_tracking_dict( im, num_classes, online_tlwhs, @@ -213,55 +202,40 @@ def mot_keypoint_unite_predict_video(mot_model, frame_id=frame_id, fps=mot_kp_fps) - im = np.array(online_im) - - frame_id += 1 - print('detect frame: %d' % (frame_id)) - - if FLAGS.save_images: - save_dir = os.path.join(FLAGS.output_dir, video_name.split('.')[-2]) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - cv2.imwrite( - os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), im) - else: - writer.write(im) + writer.write(im) if camera_id != -1: cv2.imshow('Tracking and keypoint results', im) if cv2.waitKey(1) & 0xFF == ord('q'): break - if FLAGS.save_mot_txts: - result_filename = os.path.join(FLAGS.output_dir, - video_name.split('.')[-2] + '.txt') - write_mot_results(result_filename, mot_results, data_type, num_classes) - - if FLAGS.save_images: - save_dir = os.path.join(FLAGS.output_dir, video_name.split('.')[-2]) - cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format(save_dir, - out_path) - os.system(cmd_str) - print('Save video in {}.'.format(out_path)) - else: - writer.release() + writer.release() + print('output_video saved to: {}'.format(out_path)) -def main(): - pred_config = PredictConfig(FLAGS.mot_model_dir) - mot_model = JDE_Detector( - pred_config, - FLAGS.mot_model_dir, - device=FLAGS.device, - run_mode=FLAGS.run_mode, - trt_min_shape=FLAGS.trt_min_shape, - trt_max_shape=FLAGS.trt_max_shape, - trt_opt_shape=FLAGS.trt_opt_shape, - trt_calib_mode=FLAGS.trt_calib_mode, - cpu_threads=FLAGS.cpu_threads, - enable_mkldnn=FLAGS.enable_mkldnn) - pred_config = PredictConfig_KeyPoint(FLAGS.keypoint_model_dir) - keypoint_model = KeyPoint_Detector( - pred_config, +def main(): + deploy_file = os.path.join(FLAGS.mot_model_dir, 'infer_cfg.yml') + with open(deploy_file) as f: + yml_conf = yaml.safe_load(f) + arch = yml_conf['arch'] + mot_detector_func = 'SDE_Detector' + if arch in MOT_JDE_SUPPORT_MODELS: + mot_detector_func = 'JDE_Detector' + + mot_detector = eval(mot_detector_func)(FLAGS.mot_model_dir, + FLAGS.tracker_config, + device=FLAGS.device, + run_mode=FLAGS.run_mode, + batch_size=1, + trt_min_shape=FLAGS.trt_min_shape, + trt_max_shape=FLAGS.trt_max_shape, + trt_opt_shape=FLAGS.trt_opt_shape, + trt_calib_mode=FLAGS.trt_calib_mode, + cpu_threads=FLAGS.cpu_threads, + enable_mkldnn=FLAGS.enable_mkldnn, + threshold=FLAGS.mot_threshold, + output_dir=FLAGS.output_dir) + + topdown_keypoint_detector = KeyPointDetector( FLAGS.keypoint_model_dir, device=FLAGS.device, run_mode=FLAGS.run_mode, @@ -272,22 +246,27 @@ def main(): trt_calib_mode=FLAGS.trt_calib_mode, cpu_threads=FLAGS.cpu_threads, enable_mkldnn=FLAGS.enable_mkldnn, + threshold=FLAGS.keypoint_threshold, + output_dir=FLAGS.output_dir, use_dark=FLAGS.use_dark) + keypoint_arch = topdown_keypoint_detector.pred_config.arch + assert KEYPOINT_SUPPORT_MODELS[ + keypoint_arch] == 'keypoint_topdown', 'MOT-Keypoint unite inference only supports topdown models.' 
# predict from video file or camera video stream
     if FLAGS.video_file is not None or FLAGS.camera_id != -1:
-        mot_keypoint_unite_predict_video(mot_model, keypoint_model,
-                                         FLAGS.camera_id,
-                                         FLAGS.keypoint_batch_size)
+        mot_topdown_unite_predict_video(
+            mot_detector, topdown_keypoint_detector, FLAGS.camera_id,
+            FLAGS.keypoint_batch_size, FLAGS.save_res)
     else:
         # predict from image
         img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
-        mot_keypoint_unite_predict_image(mot_model, keypoint_model, img_list,
-                                         FLAGS.keypoint_batch_size)
-
+        mot_topdown_unite_predict(mot_detector, topdown_keypoint_detector,
+                                  img_list, FLAGS.keypoint_batch_size,
+                                  FLAGS.save_res)
         if not FLAGS.run_benchmark:
-            mot_model.det_times.info(average=True)
-            keypoint_model.det_times.info(average=True)
+            mot_detector.det_times.info(average=True)
+            topdown_keypoint_detector.det_times.info(average=True)
         else:
             mode = FLAGS.run_mode
             mot_model_dir = FLAGS.mot_model_dir
@@ -295,14 +274,15 @@ def main():
                 'model_name': mot_model_dir.strip('/').split('/')[-1],
                 'precision': mode.split('_')[-1]
             }
-            bench_log(mot_model, img_list, mot_model_info, name='MOT')
+            bench_log(mot_detector, img_list, mot_model_info, name='MOT')
 
             keypoint_model_dir = FLAGS.keypoint_model_dir
             keypoint_model_info = {
                 'model_name': keypoint_model_dir.strip('/').split('/')[-1],
                 'precision': mode.split('_')[-1]
             }
-            bench_log(keypoint_model, img_list, keypoint_model_info, 'KeyPoint')
+            bench_log(topdown_keypoint_detector, img_list, keypoint_model_info,
+                      FLAGS.keypoint_batch_size, 'KeyPoint')
 
 
 if __name__ == '__main__':
diff --git a/deploy/python/mot_keypoint_unite_utils.py b/deploy/python/mot_keypoint_unite_utils.py
index 91a74638b..246f46fe9 100644
--- a/deploy/python/mot_keypoint_unite_utils.py
+++ b/deploy/python/mot_keypoint_unite_utils.py
@@ -123,5 +123,17 @@ def argsparser():
         type=bool,
         default=True,
         help='whether to use darkpose to get better keypoint position predict ')
-
+    parser.add_argument(
+        '--save_res',
+        type=bool,
+        default=False,
+        help=(
+            "whether to save predict results to json file"
+            "1) store_res: a list of image_data"
+            "2) image_data: [imageid, rects, [keypoints, scores]]"
+            "3) rects: list of rect [xmin, ymin, xmax, ymax]"
+            "4) keypoints: 17(joint numbers)*[x, y, conf], total 51 data in list"
+            "5) scores: mean of all joint conf"))
+    parser.add_argument(
+        "--tracker_config", type=str, default=None,
+        help=("path of tracker config file"))
     return parser
diff --git a/deploy/python/mot_sde_infer.py b/deploy/python/mot_sde_infer.py
index 23744b9cc..f54c8c312 100644
--- a/deploy/python/mot_sde_infer.py
+++ b/deploy/python/mot_sde_infer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,93 +18,38 @@ import yaml import cv2 import numpy as np from collections import defaultdict - import paddle -from paddle.inference import Config -from paddle.inference import create_predictor -from picodet_postprocess import PicoDetPostProcess -from utils import argsparser, Timer, get_current_memory_mb -from infer import Detector, DetectorPicoDet, get_test_images, print_arguments, PredictConfig -from infer import load_predictor from benchmark_utils import PaddleInferBenchmark +from preprocess import decode_image +from utils import argsparser, Timer, get_current_memory_mb +from infer import Detector, get_test_images, print_arguments, bench_log, PredictConfig -from ppdet.modeling.mot.tracker import DeepSORTTracker -from ppdet.modeling.mot.visualization import plot_tracking -from ppdet.modeling.mot.utils import MOTTimer, write_mot_results +# add python path +import sys +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) + +from pptracking.python.mot import JDETracker +from pptracking.python.mot.utils import MOTTimer, write_mot_results +from pptracking.python.visualize import plot_tracking, plot_tracking_dict # Global dictionary -MOT_SUPPORT_MODELS = {'DeepSORT'} - - -def bench_log(detector, img_list, model_info, batch_size=1, name=None): - mems = { - 'cpu_rss_mb': detector.cpu_mem / len(img_list), - 'gpu_rss_mb': detector.gpu_mem / len(img_list), - 'gpu_util': detector.gpu_util * 100 / len(img_list) - } - perf_info = detector.det_times.report(average=True) - data_info = { - 'batch_size': batch_size, - 'shape': "dynamic_shape", - 'data_num': perf_info['img_num'] - } - log = PaddleInferBenchmark(detector.config, model_info, data_info, - perf_info, mems) - log(name) - - -def scale_coords(coords, input_shape, im_shape, scale_factor): - im_shape = im_shape[0] - ratio = scale_factor[0][0] - pad_w = (input_shape[1] - int(im_shape[1])) / 2 - pad_h = (input_shape[0] - int(im_shape[0])) / 2 - coords[:, 0::2] -= pad_w - coords[:, 1::2] -= pad_h - coords[:, 0:4] /= ratio - coords[:, :4] = np.clip(coords[:, :4], a_min=0, a_max=coords[:, :4].max()) - return coords.round() - - -def clip_box(xyxy, input_shape, im_shape, scale_factor): - im_shape = im_shape[0] - ratio = scale_factor[0][0] - img0_shape = [int(im_shape[0] / ratio), int(im_shape[1] / ratio)] - xyxy[:, 0::2] = np.clip(xyxy[:, 0::2], a_min=0, a_max=img0_shape[1]) - xyxy[:, 1::2] = np.clip(xyxy[:, 1::2], a_min=0, a_max=img0_shape[0]) - w = xyxy[:, 2:3] - xyxy[:, 0:1] - h = xyxy[:, 3:4] - xyxy[:, 1:2] - mask = np.logical_and(h > 0, w > 0) - keep_idx = np.nonzero(mask) - return xyxy[keep_idx[0]], keep_idx - - -def preprocess_reid(imgs, - w=64, - h=192, - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]): - im_batch = [] - for img in imgs: - img = cv2.resize(img, (w, h)) - img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255 - img_mean = np.array(mean).reshape((3, 1, 1)) - img_std = np.array(std).reshape((3, 1, 1)) - img -= img_mean - img /= img_std - img = np.expand_dims(img, axis=0) - im_batch.append(img) - im_batch = np.concatenate(im_batch, 0) - return im_batch +MOT_SDE_SUPPORT_MODELS = { + 'DeepSORT', + 'ByteTrack', + 'YOLO', +} class SDE_Detector(Detector): """ Args: - pred_config (object): config of model, defined by `Config(model_dir)` model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml + tracker_config (str): tracker config path device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU run_mode (str): mode of 
running(paddle/trt_fp32/trt_fp16)
+        batch_size (int): batch size in inference
         trt_min_shape (int): min shape for dynamic shape in trt
         trt_max_shape (int): max shape for dynamic shape in trt
         trt_opt_shape (int): opt shape for dynamic shape in trt
@@ -112,22 +57,24 @@ class SDE_Detector(Detector):
             calibration, trt_calib_mode need to set True
         cpu_threads (int): cpu threads
         enable_mkldnn (bool): whether to open MKLDNN
+        output_dir (str): directory to save the visualized results
+        threshold (float): score threshold of the detected results
     """
 
     def __init__(self,
-                 pred_config,
                  model_dir,
+                 tracker_config,
                  device='CPU',
                  run_mode='paddle',
                  batch_size=1,
                  trt_min_shape=1,
-                 trt_max_shape=1088,
-                 trt_opt_shape=608,
+                 trt_max_shape=1280,
+                 trt_opt_shape=640,
                  trt_calib_mode=False,
                  cpu_threads=1,
-                 enable_mkldnn=False):
+                 enable_mkldnn=False,
+                 output_dir='output',
+                 threshold=0.5):
         super(SDE_Detector, self).__init__(
-            pred_config=pred_config,
             model_dir=model_dir,
             device=device,
             run_mode=run_mode,
@@ -137,561 +84,248 @@ class SDE_Detector(Detector):
             trt_opt_shape=trt_opt_shape,
             trt_calib_mode=trt_calib_mode,
             cpu_threads=cpu_threads,
-            enable_mkldnn=enable_mkldnn)
-        assert batch_size == 1, "The JDE Detector only supports batch size=1 now"
-        self.pred_config = pred_config
-
-    def postprocess(self, boxes, input_shape, im_shape, scale_factor, threshold,
-                    scaled):
-        over_thres_idx = np.nonzero(boxes[:, 1:2] >= threshold)[0]
-        if len(over_thres_idx) == 0:
-            pred_dets = np.zeros((1, 6), dtype=np.float32)
-            pred_xyxys = np.zeros((1, 4), dtype=np.float32)
-            return pred_dets, pred_xyxys
-        else:
-            boxes = boxes[over_thres_idx]
-
-        if not scaled:
-            # scaled means whether the coords after detector outputs
-            # have been scaled back to the original image, set True
-            # in general detector, set False in JDE YOLOv3.
-            pred_bboxes = scale_coords(boxes[:, 2:], input_shape, im_shape,
-                                       scale_factor)
-        else:
-            pred_bboxes = boxes[:, 2:]
-
-        pred_xyxys, keep_idx = clip_box(pred_bboxes, input_shape, im_shape,
-                                        scale_factor)
-        if len(keep_idx[0]) == 0:
-            pred_dets = np.zeros((1, 6), dtype=np.float32)
-            pred_xyxys = np.zeros((1, 4), dtype=np.float32)
-            return pred_dets, pred_xyxys
-
-        pred_scores = boxes[:, 1:2][keep_idx[0]]
-        pred_cls_ids = boxes[:, 0:1][keep_idx[0]]
-        pred_tlwhs = np.concatenate(
-            (pred_xyxys[:, 0:2], pred_xyxys[:, 2:4] - pred_xyxys[:, 0:2] + 1),
-            axis=1)
-
+            enable_mkldnn=enable_mkldnn,
+            output_dir=output_dir,
+            threshold=threshold, )
+        assert batch_size == 1, "MOT model only supports batch_size=1."
+ self.det_times = Timer(with_tracker=True) + self.num_classes = len(self.pred_config.labels) + + # tracker config + self.tracker_config = tracker_config + cfg = yaml.safe_load(open(self.tracker_config))['tracker'] + min_box_area = cfg.get('min_box_area', 200) + vertical_ratio = cfg.get('vertical_ratio', 1.6) + use_byte = cfg.get('use_byte', True) + match_thres = cfg.get('match_thres', 0.9) + conf_thres = cfg.get('conf_thres', 0.6) + low_conf_thres = cfg.get('low_conf_thres', 0.1) + + self.tracker = JDETracker( + use_byte=use_byte, + num_classes=self.num_classes, + min_box_area=min_box_area, + vertical_ratio=vertical_ratio, + match_thres=match_thres, + conf_thres=conf_thres, + low_conf_thres=low_conf_thres) + + def tracking(self, det_results): + pred_dets = det_results['boxes'] + pred_embs = None pred_dets = np.concatenate( - (pred_tlwhs, pred_scores, pred_cls_ids), axis=1) - - return pred_dets, pred_xyxys - - def predict(self, image, scaled, threshold=0.5, repeats=1, add_timer=True): - ''' - Args: - image (np.ndarray): image numpy data - scaled (bool): whether the coords after detector outputs are scaled, - default False in jde yolov3, set True in general detector. - threshold (float): threshold of predicted box' score - repeats (int): repeat number for prediction - add_timer (bool): whether add timer during prediction - Returns: - pred_dets (np.ndarray, [N, 6]) - ''' - # preprocess - if add_timer: - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(image) - - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle(input_names[i]) - input_tensor.copy_from_cpu(inputs[input_names[i]]) - - if add_timer: - self.det_times.preprocess_time_s.end() - self.det_times.inference_time_s.start() - # model prediction - for i in range(repeats): - self.predictor.run() - output_names = self.predictor.get_output_names() - boxes_tensor = self.predictor.get_output_handle(output_names[0]) - boxes = boxes_tensor.copy_to_cpu() - - if add_timer: - self.det_times.inference_time_s.end(repeats=repeats) - self.det_times.postprocess_time_s.start() - - # postprocess - if len(boxes) == 0: - pred_dets = np.zeros((1, 6), dtype=np.float32) - pred_xyxys = np.zeros((1, 4), dtype=np.float32) - else: - input_shape = inputs['image'].shape[2:] - im_shape = inputs['im_shape'] - scale_factor = inputs['scale_factor'] - - pred_dets, pred_xyxys = self.postprocess( - boxes, input_shape, im_shape, scale_factor, threshold, scaled) - - if add_timer: - self.det_times.postprocess_time_s.end() - self.det_times.img_num += 1 - return pred_dets, pred_xyxys - - -class SDE_DetectorPicoDet(DetectorPicoDet): - """ - Args: - pred_config (object): config of model, defined by `Config(model_dir)` - model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml - device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU - run_mode (str): mode of running(paddle/trt_fp32/trt_fp16) - trt_min_shape (int): min shape for dynamic shape in trt - trt_max_shape (int): max shape for dynamic shape in trt - trt_opt_shape (int): opt shape for dynamic shape in trt - trt_calib_mode (bool): If the model is produced by TRT offline quantitative - calibration, trt_calib_mode need to set True - cpu_threads (int): cpu threads - enable_mkldnn (bool): whether to open MKLDNN - """ - - def __init__(self, - pred_config, - model_dir, - device='CPU', - run_mode='paddle', - batch_size=1, - trt_min_shape=1, - trt_max_shape=1088, - 
trt_opt_shape=608, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False): - super(SDE_DetectorPicoDet, self).__init__( - pred_config=pred_config, - model_dir=model_dir, - device=device, - run_mode=run_mode, - batch_size=batch_size, - trt_min_shape=trt_min_shape, - trt_max_shape=trt_max_shape, - trt_opt_shape=trt_opt_shape, - trt_calib_mode=trt_calib_mode, - cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn) - assert batch_size == 1, "The JDE Detector only supports batch size=1 now" - self.pred_config = pred_config - - def postprocess_bboxes(self, boxes, input_shape, im_shape, scale_factor, - threshold): - over_thres_idx = np.nonzero(boxes[:, 1:2] >= threshold)[0] - if len(over_thres_idx) == 0: - pred_dets = np.zeros((1, 6), dtype=np.float32) - pred_xyxys = np.zeros((1, 4), dtype=np.float32) - return pred_dets, pred_xyxys - else: - boxes = boxes[over_thres_idx] - - pred_bboxes = boxes[:, 2:] - - pred_xyxys, keep_idx = clip_box(pred_bboxes, input_shape, im_shape, - scale_factor) - if len(keep_idx[0]) == 0: - pred_dets = np.zeros((1, 6), dtype=np.float32) - pred_xyxys = np.zeros((1, 4), dtype=np.float32) - return pred_dets, pred_xyxys - - pred_scores = boxes[:, 1:2][keep_idx[0]] - pred_cls_ids = boxes[:, 0:1][keep_idx[0]] - pred_tlwhs = np.concatenate( - (pred_xyxys[:, 0:2], pred_xyxys[:, 2:4] - pred_xyxys[:, 0:2] + 1), - axis=1) - - pred_dets = np.concatenate( - (pred_tlwhs, pred_scores, pred_cls_ids), axis=1) - return pred_dets, pred_xyxys - - def predict(self, image, scaled, threshold=0.5, repeats=1, add_timer=True): - ''' - Args: - image (np.ndarray): image numpy data - scaled (bool): whether the coords after detector outputs are scaled, - default False in jde yolov3, set True in general detector. - threshold (float): threshold of predicted box' score - repeats (int): repeat number for prediction - add_timer (bool): whether add timer during prediction - - Returns: - pred_dets (np.ndarray, [N, 6]) - ''' - # preprocess - if add_timer: - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(image) - - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle(input_names[i]) - input_tensor.copy_from_cpu(inputs[input_names[i]]) - - if add_timer: - self.det_times.preprocess_time_s.end() - self.det_times.inference_time_s.start() - - # model prediction - np_score_list, np_boxes_list = [], [] - for i in range(repeats): - self.predictor.run() - np_score_list.clear() - np_boxes_list.clear() - output_names = self.predictor.get_output_names() - num_outs = int(len(output_names) / 2) - for out_idx in range(num_outs): - np_score_list.append( - self.predictor.get_output_handle(output_names[out_idx]) - .copy_to_cpu()) - np_boxes_list.append( - self.predictor.get_output_handle(output_names[ - out_idx + num_outs]).copy_to_cpu()) - - if add_timer: - self.det_times.inference_time_s.end(repeats=repeats) - self.det_times.img_num += 1 - self.det_times.postprocess_time_s.start() - - # postprocess - self.postprocess = PicoDetPostProcess( - inputs['image'].shape[2:], - inputs['im_shape'], - inputs['scale_factor'], - strides=self.pred_config.fpn_stride, - nms_threshold=self.pred_config.nms['nms_threshold']) - boxes, boxes_num = self.postprocess(np_score_list, np_boxes_list) - - if len(boxes) == 0: - pred_dets = np.zeros((1, 6), dtype=np.float32) - pred_xyxys = np.zeros((1, 4), dtype=np.float32) - else: - input_shape = inputs['image'].shape[2:] - im_shape = inputs['im_shape'] - scale_factor = inputs['scale_factor'] - 
pred_dets, pred_xyxys = self.postprocess_bboxes( - boxes, input_shape, im_shape, scale_factor, threshold) - if add_timer: - self.det_times.postprocess_time_s.end() - return pred_dets, pred_xyxys - - -class SDE_ReID(object): - def __init__(self, - pred_config, - model_dir, - device='CPU', - run_mode='paddle', - batch_size=50, - trt_min_shape=1, - trt_max_shape=1088, - trt_opt_shape=608, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False): - self.pred_config = pred_config - self.predictor, self.config = load_predictor( - model_dir, - run_mode=run_mode, - batch_size=batch_size, - min_subgraph_size=self.pred_config.min_subgraph_size, - device=device, - use_dynamic_shape=self.pred_config.use_dynamic_shape, - trt_min_shape=trt_min_shape, - trt_max_shape=trt_max_shape, - trt_opt_shape=trt_opt_shape, - trt_calib_mode=trt_calib_mode, - cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn) - self.det_times = Timer() - self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0 - self.batch_size = batch_size - assert pred_config.tracker, "Tracking model should have tracker" - pt = pred_config.tracker - max_age = pt['max_age'] if 'max_age' in pt else 30 - max_iou_distance = pt[ - 'max_iou_distance'] if 'max_iou_distance' in pt else 0.7 - self.tracker = DeepSORTTracker( - max_age=max_age, max_iou_distance=max_iou_distance) - - def get_crops(self, xyxy, ori_img): - w, h = self.tracker.input_size - self.det_times.preprocess_time_s.start() - crops = [] - xyxy = xyxy.astype(np.int64) - ori_img = ori_img.transpose(1, 0, 2) # [h,w,3]->[w,h,3] - for i, bbox in enumerate(xyxy): - crop = ori_img[bbox[0]:bbox[2], bbox[1]:bbox[3], :] - crops.append(crop) - crops = preprocess_reid(crops, w, h) - self.det_times.preprocess_time_s.end() - - return crops - - def preprocess(self, crops): - # to keep fast speed, only use topk crops - crops = crops[:self.batch_size] - inputs = {} - inputs['crops'] = np.array(crops).astype('float32') - return inputs - - def postprocess(self, pred_dets, pred_embs): - tracker = self.tracker - tracker.predict() - online_targets = tracker.update(pred_dets, pred_embs) - - online_tlwhs, online_scores, online_ids = [], [], [] - for t in online_targets: - if not t.is_confirmed() or t.time_since_update > 1: - continue - tlwh = t.to_tlwh() - tscore = t.score - tid = t.track_id - if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue - if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ - 3] > tracker.vertical_ratio: - continue - online_tlwhs.append(tlwh) - online_scores.append(tscore) - online_ids.append(tid) - - return online_tlwhs, online_scores, online_ids - - def predict(self, crops, pred_dets, repeats=1, add_timer=True): - # preprocess - if add_timer: - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(crops) - - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle(input_names[i]) - input_tensor.copy_from_cpu(inputs[input_names[i]]) - if add_timer: - self.det_times.preprocess_time_s.end() - self.det_times.inference_time_s.start() - - # model prediction - for i in range(repeats): - self.predictor.run() - output_names = self.predictor.get_output_names() - feature_tensor = self.predictor.get_output_handle(output_names[0]) - pred_embs = feature_tensor.copy_to_cpu() - if add_timer: - self.det_times.inference_time_s.end(repeats=repeats) - self.det_times.postprocess_time_s.start() - - # postprocess - online_tlwhs, online_scores, online_ids = self.postprocess(pred_dets, - pred_embs) - if add_timer: - 
self.det_times.postprocess_time_s.end() - self.det_times.img_num += 1 + (pred_dets[:, 2:], pred_dets[:, 1:2], pred_dets[:, 0:1]), 1) + # pred_dets should be 'x0, y0, x1, y1, score, cls_id' + + online_targets_dict = self.tracker.update(pred_dets, pred_embs) + online_tlwhs = defaultdict(list) + online_scores = defaultdict(list) + online_ids = defaultdict(list) + for cls_id in range(self.num_classes): + online_targets = online_targets_dict[cls_id] + for t in online_targets: + tlwh = t.tlwh + tid = t.track_id + tscore = t.score + if tlwh[2] * tlwh[3] <= self.tracker.min_box_area: + continue + if self.tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ + 3] > self.tracker.vertical_ratio: + continue + online_tlwhs[cls_id].append(tlwh) + online_ids[cls_id].append(tid) + online_scores[cls_id].append(tscore) return online_tlwhs, online_scores, online_ids + def predict_image(self, + image_list, + run_benchmark=False, + repeats=1, + visual=True): + mot_results = [] + num_classes = self.num_classes + image_list.sort() + ids2names = self.pred_config.labels + for frame_id, img_file in enumerate(image_list): + batch_image_list = [img_file] # bs=1 in MOT model + if run_benchmark: + # preprocess + inputs = self.preprocess(batch_image_list) # warmup + self.det_times.preprocess_time_s.start() + inputs = self.preprocess(batch_image_list) + self.det_times.preprocess_time_s.end() + + # model prediction + result_warmup = self.predict(repeats=repeats) # warmup + self.det_times.inference_time_s.start() + result = self.predict(repeats=repeats) + self.det_times.inference_time_s.end(repeats=repeats) + + # postprocess + result_warmup = self.postprocess(inputs, result) # warmup + self.det_times.postprocess_time_s.start() + det_result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + + # tracking + result_warmup = self.tracking(det_result) + self.det_times.tracking_time_s.start() + online_tlwhs, online_scores, online_ids = self.tracking( + det_result) + self.det_times.tracking_time_s.end() + self.det_times.img_num += 1 + + cm, gm, gu = get_current_memory_mb() + self.cpu_mem += cm + self.gpu_mem += gm + self.gpu_util += gu -def predict_image(detector, reid_model, image_list): - image_list.sort() - for i, img_file in enumerate(image_list): - frame = cv2.imread(img_file) - if FLAGS.run_benchmark: - # warmup - pred_dets, pred_xyxys = detector.predict( - [frame], - FLAGS.scaled, - FLAGS.threshold, - repeats=10, - add_timer=True) - # run benchmark - pred_dets, pred_xyxys = detector.predict( - [frame], - FLAGS.scaled, - FLAGS.threshold, - repeats=10, - add_timer=True) - cm, gm, gu = get_current_memory_mb() - detector.cpu_mem += cm - detector.gpu_mem += gm - detector.gpu_util += gu - print('Test iter {}, file name:{}'.format(i, img_file)) - else: - pred_dets, pred_xyxys = detector.predict([frame], FLAGS.scaled, - FLAGS.threshold) - - if len(pred_dets) == 1 and np.sum(pred_dets) == 0: - print('Frame {} has no object, try to modify score threshold.'. 
- format(i)) - online_im = frame - else: - # reid process - crops = reid_model.get_crops(pred_xyxys, frame) - - if FLAGS.run_benchmark: - # warmup - online_tlwhs, online_scores, online_ids = reid_model.predict( - crops, pred_dets, repeats=10, add_timer=False) - # run benchmark - online_tlwhs, online_scores, online_ids = reid_model.predict( - crops, pred_dets, repeats=10, add_timer=False) else: - online_tlwhs, online_scores, online_ids = reid_model.predict( - crops, pred_dets) - online_im = plot_tracking( - frame, online_tlwhs, online_ids, online_scores, frame_id=i) - - if FLAGS.save_images: - if not os.path.exists(FLAGS.output_dir): - os.makedirs(FLAGS.output_dir) - img_name = os.path.split(img_file)[-1] - out_path = os.path.join(FLAGS.output_dir, img_name) - cv2.imwrite(out_path, online_im) - print("save result to: " + out_path) - - -def predict_video(detector, reid_model, camera_id): - if camera_id != -1: - capture = cv2.VideoCapture(camera_id) - video_name = 'mot_output.mp4' - else: - capture = cv2.VideoCapture(FLAGS.video_file) - video_name = os.path.split(FLAGS.video_file)[-1] - # Get Video info : resolution, fps, frame count - width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) - fps = int(capture.get(cv2.CAP_PROP_FPS)) - frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) - print("fps: %d, frame_count: %d" % (fps, frame_count)) - - if not os.path.exists(FLAGS.output_dir): - os.makedirs(FLAGS.output_dir) - out_path = os.path.join(FLAGS.output_dir, video_name) - if not FLAGS.save_images: - fourcc = cv2.VideoWriter_fourcc(* 'mp4v') - writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) - frame_id = 0 - timer = MOTTimer() - results = defaultdict(list) - while (1): - ret, frame = capture.read() - if not ret: - break - timer.tic() - pred_dets, pred_xyxys = detector.predict([frame], FLAGS.scaled, - FLAGS.threshold) - - if len(pred_dets) == 1 and np.sum(pred_dets) == 0: - print('Frame {} has no object, try to modify score threshold.'. 
- format(frame_id)) - timer.toc() - im = frame + self.det_times.preprocess_time_s.start() + inputs = self.preprocess(batch_image_list) + self.det_times.preprocess_time_s.end() + + self.det_times.inference_time_s.start() + result = self.predict() + self.det_times.inference_time_s.end() + + self.det_times.postprocess_time_s.start() + det_result = self.postprocess(inputs, result) + self.det_times.postprocess_time_s.end() + + # tracking process + self.det_times.tracking_time_s.start() + online_tlwhs, online_scores, online_ids = self.tracking( + det_result) + self.det_times.tracking_time_s.end() + self.det_times.img_num += 1 + + if visual: + if frame_id % 10 == 0: + print('Tracking frame {}'.format(frame_id)) + frame, _ = decode_image(img_file, {}) + + im = plot_tracking_dict( + frame, + num_classes, + online_tlwhs, + online_ids, + online_scores, + frame_id=frame_id, + ids2names=[]) + seq_name = image_list[0].split('/')[-2] + save_dir = os.path.join(self.output_dir, seq_name) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + cv2.imwrite( + os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), im) + + mot_results.append([online_tlwhs, online_scores, online_ids]) + return mot_results + + def predict_video(self, video_file, camera_id): + video_out_name = 'output.mp4' + if camera_id != -1: + capture = cv2.VideoCapture(camera_id) else: - # reid process - crops = reid_model.get_crops(pred_xyxys, frame) - online_tlwhs, online_scores, online_ids = reid_model.predict( - crops, pred_dets) - results[0].append( - (frame_id + 1, online_tlwhs, online_scores, online_ids)) + capture = cv2.VideoCapture(video_file) + video_out_name = os.path.split(video_file)[-1] + # Get Video info : resolution, fps, frame count + width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = int(capture.get(cv2.CAP_PROP_FPS)) + frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) + print("fps: %d, frame_count: %d" % (fps, frame_count)) + + if not os.path.exists(self.output_dir): + os.makedirs(self.output_dir) + out_path = os.path.join(self.output_dir, video_out_name) + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) + + frame_id = 1 + timer = MOTTimer() + results = defaultdict(list) # support single class and multi classes + num_classes = self.num_classes + while (1): + ret, frame = capture.read() + if not ret: + break + if frame_id % 10 == 0: + print('Tracking frame: %d' % (frame_id)) + frame_id += 1 + + timer.tic() + mot_results = self.predict_image([frame], visual=False) timer.toc() - fps = 1. / timer.average_time - im = plot_tracking( + online_tlwhs, online_scores, online_ids = mot_results[0] + for cls_id in range(num_classes): + results[cls_id].append( + (frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id], + online_ids[cls_id])) + + fps = 1. 
/ timer.duration
+            im = plot_tracking_dict(
                 frame,
+                num_classes,
                 online_tlwhs,
                 online_ids,
                 online_scores,
                 frame_id=frame_id,
-                fps=fps)
-
-        if FLAGS.save_images:
-            save_dir = os.path.join(FLAGS.output_dir, video_name.split('.')[-2])
-            if not os.path.exists(save_dir):
-                os.makedirs(save_dir)
-            cv2.imwrite(
-                os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), im)
-        else:
-            writer.write(im)
-
-        frame_id += 1
-        print('detect frame:%d' % (frame_id))
-
-        if camera_id != -1:
-            cv2.imshow('Tracking Detection', im)
-            if cv2.waitKey(1) & 0xFF == ord('q'):
-                break
+                fps=fps,
+                ids2names=[])
 
-    if FLAGS.save_mot_txts:
-        result_filename = os.path.join(FLAGS.output_dir,
-                                       video_name.split('.')[-2] + '.txt')
-        write_mot_results(result_filename, results)
-
-    if FLAGS.save_images:
-        save_dir = os.path.join(FLAGS.output_dir, video_name.split('.')[-2])
-        cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format(save_dir,
-                                                              out_path)
-        os.system(cmd_str)
-        print('Save video in {}.'.format(out_path))
-    else:
+            writer.write(im)
+            if camera_id != -1:
+                cv2.imshow('Tracking Detection', im)
+                if cv2.waitKey(1) & 0xFF == ord('q'):
+                    break
         writer.release()
 
 
 def main():
-    pred_config = PredictConfig(FLAGS.model_dir)
-    detector_func = 'SDE_Detector'
-    if pred_config.arch == 'PicoDet':
-        detector_func = 'SDE_DetectorPicoDet'
-
-    detector = eval(detector_func)(pred_config,
-                                   FLAGS.model_dir,
-                                   device=FLAGS.device,
-                                   run_mode=FLAGS.run_mode,
-                                   batch_size=FLAGS.batch_size,
-                                   trt_min_shape=FLAGS.trt_min_shape,
-                                   trt_max_shape=FLAGS.trt_max_shape,
-                                   trt_opt_shape=FLAGS.trt_opt_shape,
-                                   trt_calib_mode=FLAGS.trt_calib_mode,
-                                   cpu_threads=FLAGS.cpu_threads,
-                                   enable_mkldnn=FLAGS.enable_mkldnn)
-
-    pred_config = PredictConfig(FLAGS.reid_model_dir)
-    reid_model = SDE_ReID(
-        pred_config,
-        FLAGS.reid_model_dir,
+    deploy_file = os.path.join(FLAGS.model_dir, 'infer_cfg.yml')
+    with open(deploy_file) as f:
+        yml_conf = yaml.safe_load(f)
+    arch = yml_conf['arch']
+    assert arch in MOT_SDE_SUPPORT_MODELS, '{} is not supported.'.format(arch)
+    detector = SDE_Detector(
+        FLAGS.model_dir,
+        FLAGS.tracker_config,
         device=FLAGS.device,
         run_mode=FLAGS.run_mode,
-        batch_size=FLAGS.reid_batch_size,
+        batch_size=FLAGS.batch_size,
         trt_min_shape=FLAGS.trt_min_shape,
         trt_max_shape=FLAGS.trt_max_shape,
         trt_opt_shape=FLAGS.trt_opt_shape,
         trt_calib_mode=FLAGS.trt_calib_mode,
         cpu_threads=FLAGS.cpu_threads,
-        enable_mkldnn=FLAGS.enable_mkldnn)
+        enable_mkldnn=FLAGS.enable_mkldnn,
+        threshold=FLAGS.threshold,
+        output_dir=FLAGS.output_dir)
 
     # predict from video file or camera video stream
     if FLAGS.video_file is not None or FLAGS.camera_id != -1:
-        predict_video(detector, reid_model, FLAGS.camera_id)
+        detector.predict_video(FLAGS.video_file, FLAGS.camera_id)
    else:
         # predict from image
+        if FLAGS.image_dir is None and FLAGS.image_file is not None:
+            assert FLAGS.batch_size == 1, "--batch_size should be 1 in MOT models."
         img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
-        predict_image(detector, reid_model, img_list)
+        detector.predict_image(img_list, FLAGS.run_benchmark, repeats=10)
 
         if not FLAGS.run_benchmark:
             detector.det_times.info(average=True)
-            reid_model.det_times.info(average=True)
         else:
             mode = FLAGS.run_mode
-            det_model_dir = FLAGS.model_dir
-            det_model_info = {
-                'model_name': det_model_dir.strip('/').split('/')[-1],
-                'precision': mode.split('_')[-1]
-            }
-            bench_log(detector, img_list, det_model_info, name='Det')
-
-            reid_model_dir = FLAGS.reid_model_dir
-            reid_model_info = {
-                'model_name': reid_model_dir.strip('/').split('/')[-1],
+            model_dir = FLAGS.model_dir
+            model_info = {
+                'model_name': model_dir.strip('/').split('/')[-1],
                 'precision': mode.split('_')[-1]
             }
-            bench_log(reid_model, img_list, reid_model_info, name='ReID')
+            bench_log(detector, img_list, model_info, name='MOT')
 
 
 if __name__ == '__main__':
diff --git a/deploy/python/tracker_config.yml b/deploy/python/tracker_config.yml
new file mode 100644
index 000000000..d92510148
--- /dev/null
+++ b/deploy/python/tracker_config.yml
@@ -0,0 +1,10 @@
+# Config of the tracker for the MOT SDE Detector; ByteTracker is used by default.
+# The tracker of the MOT JDE Detector is exported together with the model.
+# 'min_box_area' and 'vertical_ratio' are set for pedestrians; modify them for other object categories.
+tracker:
+  use_byte: true
+  conf_thres: 0.6
+  low_conf_thres: 0.1
+  match_thres: 0.9
+  min_box_area: 100
+  vertical_ratio: 1.6
diff --git a/deploy/python/utils.py b/deploy/python/utils.py
index 8227e282f..1ab86b1f4 100644
--- a/deploy/python/utils.py
+++ b/deploy/python/utils.py
@@ -118,6 +118,8 @@ def argsparser():
         default=False,
         help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 "
         "True in general detector.")
+    parser.add_argument(
+        "--tracker_config", type=str, default=None,
+        help=("path of tracker config file"))
     parser.add_argument(
         "--reid_model_dir",
         type=str,
@@ -165,29 +167,36 @@ class Times(object):
 
 
 class Timer(Times):
-    def __init__(self):
+    def __init__(self, with_tracker=False):
         super(Timer, self).__init__()
+        self.with_tracker = with_tracker
         self.preprocess_time_s = Times()
         self.inference_time_s = Times()
         self.postprocess_time_s = Times()
+        self.tracking_time_s = Times()
         self.img_num = 0
 
     def info(self, average=False):
-        total_time = self.preprocess_time_s.value(
-        ) + self.inference_time_s.value() + self.postprocess_time_s.value()
+        pre_time = self.preprocess_time_s.value()
+        infer_time = self.inference_time_s.value()
+        post_time = self.postprocess_time_s.value()
+        track_time = self.tracking_time_s.value()
+
+        total_time = pre_time + infer_time + post_time
+        if self.with_tracker:
+            total_time = total_time + track_time
         total_time = round(total_time, 4)
         print("------------------ Inference Time Info ----------------------")
         print("total_time(ms): {}, img_num: {}".format(total_time * 1000,
                                                        self.img_num))
-        preprocess_time = round(
-            self.preprocess_time_s.value() / max(1, self.img_num),
-            4) if average else self.preprocess_time_s.value()
-        postprocess_time = round(
-            self.postprocess_time_s.value() / max(1, self.img_num),
-            4) if average else self.postprocess_time_s.value()
-        inference_time = round(self.inference_time_s.value() /
-                               max(1, self.img_num),
-                               4) if average else self.inference_time_s.value()
+        preprocess_time = round(pre_time / max(1, self.img_num),
+                                4) if average else pre_time
+        postprocess_time = round(post_time / max(1, self.img_num),
+                                 4) if average else post_time
+        inference_time = round(infer_time / max(1, self.img_num),
+                               4) if average else infer_time
+        tracking_time = round(track_time / max(1, self.img_num),
+                              4) if average else track_time
 
         average_latency = total_time / max(1, self.img_num)
         qps = 0
@@ -195,25 +204,36 @@ class Timer(Times):
             qps = 1 / average_latency
         print("average latency time(ms): {:.2f}, QPS: {:2f}".format(
             average_latency * 1000, qps))
-        print(
-            "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}".
-            format(preprocess_time * 1000, inference_time * 1000,
-                   postprocess_time * 1000))
+        if self.with_tracker:
+            print(
+                "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}, tracking_time(ms): {:.2f}".
+                format(preprocess_time * 1000, inference_time * 1000,
+                       postprocess_time * 1000, tracking_time * 1000))
+        else:
+            print(
+                "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}".
+                format(preprocess_time * 1000, inference_time * 1000,
+                       postprocess_time * 1000))
 
     def report(self, average=False):
         dic = {}
-        dic['preprocess_time_s'] = round(
-            self.preprocess_time_s.value() / max(1, self.img_num),
-            4) if average else self.preprocess_time_s.value()
-        dic['postprocess_time_s'] = round(
-            self.postprocess_time_s.value() / max(1, self.img_num),
-            4) if average else self.postprocess_time_s.value()
-        dic['inference_time_s'] = round(
-            self.inference_time_s.value() / max(1, self.img_num),
-            4) if average else self.inference_time_s.value()
+        pre_time = self.preprocess_time_s.value()
+        infer_time = self.inference_time_s.value()
+        post_time = self.postprocess_time_s.value()
+        track_time = self.tracking_time_s.value()
+
+        dic['preprocess_time_s'] = round(pre_time / max(1, self.img_num),
+                                         4) if average else pre_time
+        dic['inference_time_s'] = round(infer_time / max(1, self.img_num),
+                                        4) if average else infer_time
+        dic['postprocess_time_s'] = round(post_time / max(1, self.img_num),
+                                          4) if average else post_time
+        dic['tracking_time_s'] = round(track_time / max(1, self.img_num),
+                                       4) if average else track_time
         dic['img_num'] = self.img_num
-        total_time = self.preprocess_time_s.value(
-        ) + self.inference_time_s.value() + self.postprocess_time_s.value()
+        total_time = pre_time + infer_time + post_time
+        if self.with_tracker:
+            total_time = total_time + track_time
         dic['total_time_s'] = round(total_time, 4)
         return dic
diff --git a/deploy/python/visualize.py b/deploy/python/visualize.py
index 7e322973d..e049a4dc1 100644
--- a/deploy/python/visualize.py
+++ b/deploy/python/visualize.py
@@ -224,13 +224,13 @@ def get_color(idx):
     return color
 
 
-def draw_pose(imgfile,
-              results,
-              visual_thread=0.6,
-              save_name='pose.jpg',
-              save_dir='output',
-              returnimg=False,
-              ids=None):
+def visualize_pose(imgfile,
+                   results,
+                   visual_thresh=0.6,
+                   save_name='pose.jpg',
+                   save_dir='output',
+                   returnimg=False,
+                   ids=None):
     try:
         import matplotlib.pyplot as plt
         import matplotlib
@@ -239,7 +239,6 @@ def draw_pose(imgfile,
         logger.error('Matplotlib not found, please install matplotlib.'
'for example: `pip install matplotlib`.') raise e - skeletons, scores = results['keypoint'] skeletons = np.array(skeletons) kpt_nums = 17 @@ -276,7 +275,7 @@ def draw_pose(imgfile, canvas = img.copy() for i in range(kpt_nums): for j in range(len(skeletons)): - if skeletons[j][i, 2] < visual_thread: + if skeletons[j][i, 2] < visual_thresh: continue if ids is None: color = colors[i] if color_set is None else colors[color_set[j] @@ -300,8 +299,8 @@ def draw_pose(imgfile, for i in range(NUM_EDGES): for j in range(len(skeletons)): edge = EDGES[i] - if skeletons[j][edge[0], 2] < visual_thread or skeletons[j][edge[ - 1], 2] < visual_thread: + if skeletons[j][edge[0], 2] < visual_thresh or skeletons[j][edge[ + 1], 2] < visual_thresh: continue cur_canvas = canvas.copy() -- GitLab
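
For reference, a minimal usage sketch of the refactored SDE entry points. The class name, argument names, and defaults below come from the patch itself; the model directory and input file names are placeholders, not part of the patch:

    # Programmatic use of the refactored SDE (ByteTrack) deploy API.
    from mot_sde_infer import SDE_Detector

    detector = SDE_Detector(
        model_dir='output_inference/my_exported_det_model',  # placeholder exported-model dir
        tracker_config='deploy/python/tracker_config.yml',   # ByteTrack settings added above
        device='GPU',
        threshold=0.5,
        output_dir='output')

    # Images: each element of the returned list is [online_tlwhs, online_scores, online_ids].
    mot_results = detector.predict_image(['imgs/00001.jpg'], visual=True)

    # Video: reads frames, runs detection + tracking, writes the visualized video to output_dir.
    detector.predict_video('test_video.mp4', camera_id=-1)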
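
Similarly, a small sketch of how the tracker-aware Timer added in utils.py is meant to be driven; the sleep is only a stand-in for a real tracker.update() call:

    # Timer(with_tracker=True) folds the new tracking stage into its totals and reports.
    import time
    from utils import Timer

    timer = Timer(with_tracker=True)
    timer.tracking_time_s.start()
    time.sleep(0.01)  # stand-in for tracker.update(...)
    timer.tracking_time_s.end()
    timer.img_num += 1

    timer.info(average=True)  # now also prints tracking_time(ms)
    perf_info = timer.report(average=True)  # includes 'tracking_time_s' for PaddleInferBenchmark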