Develop branch: add fight action for pphuman (#6160)

* add fight for PP-Human * add short_size and target_size for fight recognition * add short_size and target_size for fight_infer * modify code according to the reviews * restore the wrongly deleted lines * Update pipeline.py * Update infer_cfg.yml * visualize fight when a fight action occurs * fix garbled characters (乱码修改) * delete useless params * delete useless code str2bool

Develop branch: add fight action for pphuman (#6160)
* add fight for PP-Human * add short_size and target_size for fight recognition * add short_size and target_size for fight_infer * modify code according to the reviews * restore the wrongly deleted lines * Update pipeline.py * Update infer_cfg.yml * visualize fight when a fight action occurs * fix garbled characters (乱码修改) * delete useless params * delete useless code str2bool
67f16ed9 · XYZ_916 · GitHub · ed331ba2 · 67f16ed9 · 67f16ed9
7 changed files
--- a/deploy/pphuman/config/infer_cfg.yml
+++ b/deploy/pphuman/config/infer_cfg.yml
@@ -25,8 +25,13 @@ ATTR:
  enable: False
 VIDEO_ACTION:
-  model_dir: output_inference/pp-stm
+  model_dir: output_inference/ppTSM
  batch_size: 1
+  frame_len: 8
+  sample_freq: 7
+  short_size: 340
+  target_size: 320
+  basemode: "videobased"
  enable: False
 SKELETON_ACTION:

--- a/deploy/pphuman/datacollector.py
+++ b/deploy/pphuman/datacollector.py
@@ -23,6 +23,7 @@ class Result(object):
            'mot': dict(),
            'attr': dict(),
            'kpt': dict(),
+            'video_action': dict(),
            'skeleton_action': dict(),
            'reid': dict()
        }

--- a/deploy/pphuman/pipe_utils.py
+++ b/deploy/pphuman/pipe_utils.py
@@ -152,6 +152,7 @@ class PipeTimer(Times):
            'mot': Times(),
            'attr': Times(),
            'kpt': Times(),
+            'video_action': Times(),
            'skeleton_action': Times(),
            'reid': Times()
        }
@@ -197,6 +198,7 @@ class PipeTimer(Times):
        dic['kpt'] = round(self.module_time['kpt'].value() /
                           max(1, self.img_num),
                           4) if average else self.module_time['kpt'].value()
+        dic['video_action'] = self.module_time['video_action'].value()
        dic['skeleton_action'] = round(
            self.module_time['skeleton_action'].value() / max(1, self.img_num),
            4) if average else self.module_time['skeleton_action'].value()

--- a/deploy/pphuman/pipeline.py
+++ b/deploy/pphuman/pipeline.py
@@ -36,6 +36,7 @@ from python.infer import Detector, DetectorPicoDet
 from python.attr_infer import AttrDetector
 from python.keypoint_infer import KeyPointDetector
 from python.keypoint_postprocess import translate_to_ori_images
+from python.video_action_infer import VideoActionRecognizer
 from python.action_infer import SkeletonActionRecognizer
 from python.action_utils import KeyPointBuff, SkeletonActionVisualHelper
@@ -75,7 +76,7 @@ class Pipeline(object):
        draw_center_traj (bool): Whether drawing the trajectory of center, default as False
        secs_interval (int): The seconds interval to count after tracking, default as 10
        do_entrance_counting(bool): Whether counting the numbers of identifiers entering 
-            or getting out from the entrance, default as False，only support single class
+            or getting out from the entrance, default as False, only support single class
            counting in MOT.
    """
@@ -181,7 +182,7 @@ class Pipeline(object):
        else:
            raise ValueError(
-                "Illegal Input, please set one of ['video_file'，'camera_id'，'image_file', 'image_dir']"
+                "Illegal Input, please set one of ['video_file', 'camera_id', 'image_file', 'image_dir']"
            )
        return input
@@ -218,6 +219,7 @@ class PipePredictor(object):
        1. Tracking
        2. Tracking -> Attribute
        3. Tracking -> KeyPoint -> SkeletonAction Recognition
+        4. VideoAction Recognition
    Args:
        cfg (dict): config of models in pipeline
@@ -240,7 +242,7 @@ class PipePredictor(object):
        draw_center_traj (bool): Whether drawing the trajectory of center, default as False
        secs_interval (int): The seconds interval to count after tracking, default as 10
        do_entrance_counting(bool): Whether counting the numbers of identifiers entering 
-            or getting out from the entrance, default as False，only support single class
+            or getting out from the entrance, default as False, only support single class
            counting in MOT.
    """
@@ -277,6 +279,7 @@ class PipePredictor(object):
                'ID_BASED_CLSACTION', False) else False
        self.with_mtmct = cfg.get('REID', False)['enable'] if cfg.get(
            'REID', False) else False
        if self.with_attr:
            print('Attribute Recognition enabled')
        if self.with_skeleton_action:
@@ -296,6 +299,7 @@ class PipePredictor(object):
            "idbased": False,
            "skeletonbased": False
        }
        self.is_video = is_video
        self.multi_camera = multi_camera
        self.cfg = cfg
@@ -416,6 +420,31 @@ class PipePredictor(object):
                        use_dark=False)
                    self.kpt_buff = KeyPointBuff(skeleton_action_frames)
+            if self.with_video_action:
+                video_action_cfg = self.cfg['VIDEO_ACTION']
+                basemode = video_action_cfg['basemode']
+                self.modebase[basemode] = True
+                video_action_model_dir = video_action_cfg['model_dir']
+                video_action_batch_size = video_action_cfg['batch_size']
+                short_size = video_action_cfg["short_size"]
+                target_size = video_action_cfg["target_size"]
+                self.video_action_predictor = VideoActionRecognizer(
+                    model_dir=video_action_model_dir,
+                    short_size=short_size,
+                    target_size=target_size,
+                    device=device,
+                    run_mode=run_mode,
+                    batch_size=video_action_batch_size,
+                    trt_min_shape=trt_min_shape,
+                    trt_max_shape=trt_max_shape,
+                    trt_opt_shape=trt_opt_shape,
+                    trt_calib_mode=trt_calib_mode,
+                    cpu_threads=cpu_threads,
+                    enable_mkldnn=enable_mkldnn)
        if self.with_mtmct:
            reid_cfg = self.cfg['REID']
            model_dir = reid_cfg['model_dir']
@@ -523,9 +552,12 @@ class PipePredictor(object):
        entrance = [0, height / 2., width, height / 2.]
        video_fps = fps
+        video_action_imgs = []
        while (1):
            if frame_id % 10 == 0:
                print('frame id: ', frame_id)
            ret, frame = capture.read()
            if not ret:
                break
@@ -660,10 +692,34 @@ class PipePredictor(object):
                    self.pipeline_res.clear('reid')
            if self.with_video_action:
-                #predeal, get what your model need
+                # get the params
-                #predict, model preprocess\run\postprocess
+                frame_len = self.cfg["VIDEO_ACTION"]["frame_len"]
-                #postdeal, interact with pipeline
+                sample_freq = self.cfg["VIDEO_ACTION"]["sample_freq"]
-                pass
+                if sample_freq * frame_len > frame_count:  # video is too short
+                    sample_freq = int(frame_count / frame_len)
+                # filter the warmup frames
+                if frame_id > self.warmup_frame:
+                    self.pipe_timer.module_time['video_action'].start()
+                # collect frames
+                if frame_id % sample_freq == 0:
+                    video_action_imgs.append(frame)
+                # the number of collected frames is enough to predict video action
+                if len(video_action_imgs) == frame_len:
+                    classes, scores = self.video_action_predictor.predict(
+                        video_action_imgs)
+                    if frame_id > self.warmup_frame:
+                        self.pipe_timer.module_time['video_action'].end()
+                    video_action_res = {"class": classes[0], "score": scores[0]}
+                    self.pipeline_res.update(video_action_res, 'video_action')
+                    print("video_action_res:", video_action_res)
+                    video_action_imgs.clear()  # next clip
            self.collector.append(frame_id, self.pipeline_res)
@@ -744,10 +800,21 @@ class PipePredictor(object):
                returnimg=True)
        skeleton_action_res = result.get('skeleton_action')
-        if skeleton_action_res is not None:
+        video_action_res = result.get('video_action')
-            image = visualize_action(image, mot_res['boxes'],
+        if skeleton_action_res is not None or video_action_res is not None:
-                                     self.skeleton_action_visual_helper,
+            video_action_score = None
-                                     "SkeletonAction")
+            action_visual_helper = None
+            if video_action_res and video_action_res["class"] == 1:
+                video_action_score = video_action_res["score"]
+            if skeleton_action_res:
+                action_visual_helper = self.skeleton_action_visual_helper
+            image = visualize_action(
+                image,
+                mot_res['boxes'],
+                action_visual_collector=action_visual_helper,
+                action_text="SkeletonAction",
+                video_action_score=video_action_score,
+                video_action_text="Fight")
        return image
@@ -784,6 +851,7 @@ class PipePredictor(object):
 def main():
    cfg = merge_cfg(FLAGS)
    print_arguments(cfg)
    pipeline = Pipeline(
        cfg, FLAGS.image_file, FLAGS.image_dir, FLAGS.video_file,
        FLAGS.video_dir, FLAGS.camera_id, FLAGS.device, FLAGS.run_mode,

--- a/deploy/python/video_action_infer.py
+++ b/deploy/python/video_action_infer.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import yaml
+import glob
+import cv2
+import numpy as np
+import math
+import paddle
+import sys
+from collections import Sequence
+import paddle.nn.functional as F
+# add deploy path of PaddleDetection to sys.path
+parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))
+sys.path.insert(0, parent_path)
+from paddle.inference import Config, create_predictor
+from utils import argsparser, Timer, get_current_memory_mb
+from benchmark_utils import PaddleInferBenchmark
+from infer import Detector, print_arguments
+from video_action_preprocess import VideoDecoder, Sampler, Scale, CenterCrop, Normalization, Image2Array
+def softmax(x):
+    """Return the softmax of all elements of `x`, normalised over the whole array.
+
+    Args:
+        x (array-like): raw scores (logits).
+    Returns:
+        numpy.ndarray: values with the same shape as `x` that sum to 1.
+    """
+    x = np.asarray(x)
+    if x.size == 0:
+        # nothing to normalise; mirror the empty result of the plain formula
+        return np.exp(x)
+    # Subtracting the maximum leaves the result mathematically unchanged but
+    # keeps np.exp from overflowing to inf (and the ratio from becoming nan).
+    exp_x = np.exp(x - np.max(x))
+    return exp_x / np.sum(exp_x)
+class VideoActionRecognizer(object):
+    """
+    Video-level action recognizer running on a Paddle Inference predictor.
+    Args:
+        model_dir (str): directory holding model.pdmodel and model.pdiparams
+        device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
+        run_mode (str): mode of running(paddle/trt_fp32/trt_fp16)
+        num_seg (int): number of segments given to Sampler when decoding a video file
+        seg_len (int): segment length given to Sampler when decoding a video file
+        short_size (int): size given to the Scale preprocessing op
+        target_size (int): size given to the CenterCrop preprocessing op
+        top_k (int): number of highest-scoring classes returned by postprocess
+        batch_size (int): size of pre batch in inference, only 1 is supported
+        trt_min_shape (int): min shape for dynamic shape in trt
+        trt_max_shape (int): max shape for dynamic shape in trt
+        trt_opt_shape (int): opt shape for dynamic shape in trt
+        trt_calib_mode (bool): If the model is produced by TRT offline quantitative
+            calibration, trt_calib_mode need to set True
+        cpu_threads (int): cpu threads
+        enable_mkldnn (bool): whether to open MKLDNN
+        ir_optim (bool): forwarded to Config.switch_ir_optim, default as True
+    """
+    def __init__(self,
+                 model_dir,
+                 device='CPU',
+                 run_mode='paddle',
+                 num_seg=8,
+                 seg_len=1,
+                 short_size=256,
+                 target_size=224,
+                 top_k=1,
+                 batch_size=1,
+                 trt_min_shape=1,
+                 trt_max_shape=1280,
+                 trt_opt_shape=640,
+                 trt_calib_mode=False,
+                 cpu_threads=1,
+                 enable_mkldnn=False,
+                 ir_optim=True):
+        self.num_seg = num_seg
+        self.seg_len = seg_len
+        self.short_size = short_size
+        self.target_size = target_size
+        self.top_k = top_k
+        assert batch_size == 1, "VideoActionRecognizer only support batch_size=1 now."
+        self.model_dir = model_dir
+        self.device = device
+        self.run_mode = run_mode
+        self.batch_size = batch_size
+        self.trt_min_shape = trt_min_shape
+        self.trt_max_shape = trt_max_shape
+        self.trt_opt_shape = trt_opt_shape
+        self.trt_calib_mode = trt_calib_mode
+        self.cpu_threads = cpu_threads
+        self.enable_mkldnn = enable_mkldnn
+        self.ir_optim = ir_optim
+        self.recognize_times = Timer()
+        model_file_path = os.path.join(model_dir, "model.pdmodel")
+        params_file_path = os.path.join(model_dir, "model.pdiparams")
+        self.config = Config(model_file_path, params_file_path)
+        if device == "GPU" or device == "gpu":
+            self.config.enable_use_gpu(8000, 0)
+        else:
+            self.config.disable_gpu()
+        if self.enable_mkldnn:
+            # cache 10 different shapes for mkldnn to avoid memory leak
+            self.config.set_mkldnn_cache_capacity(10)
+            self.config.enable_mkldnn()
+        self.config.switch_ir_optim(self.ir_optim)  # default true
+        precision_map = {
+            'trt_int8': Config.Precision.Int8,
+            'trt_fp32': Config.Precision.Float32,
+            'trt_fp16': Config.Precision.Half
+        }
+        # TensorRT is only switched on for the trt_* run modes
+        if run_mode in precision_map:
+            self.config.enable_tensorrt_engine(
+                max_batch_size=self.batch_size,
+                precision_mode=precision_map[run_mode])
+        self.config.enable_memory_optim()
+        # use zero copy
+        self.config.switch_use_feed_fetch_ops(False)
+        self.predictor = create_predictor(self.config)
+    def preprocess_batch(self, file_list):
+        """
+        Preprocess several video files and concatenate their inputs.
+        Args:
+            file_list (list): video file paths
+        Returns:
+            list: one concatenated array per preprocessing output
+        """
+        # The class defines no `preprocess` method; video files are handled by
+        # preprocess_video, which returns a list of arrays per file.
+        batched_inputs = [self.preprocess_video(file) for file in file_list]
+        batched_inputs = [
+            np.concatenate([item[i] for item in batched_inputs])
+            for i in range(len(batched_inputs[0]))
+        ]
+        self.input_file = file_list
+        return batched_inputs
+    def get_timer(self):
+        """Return the Timer that records preprocess/inference/postprocess time."""
+        return self.recognize_times
+    def predict(self, input):
+        '''
+        Args:
+            input (str) or (list): video file path or image data list
+        Returns:
+            tuple: (classes, scores) as produced by postprocess, i.e. the
+                top_k class indices and their softmax scores
+        '''
+        input_names = self.predictor.get_input_names()
+        input_tensor = self.predictor.get_input_handle(input_names[0])
+        output_names = self.predictor.get_output_names()
+        output_tensor = self.predictor.get_output_handle(output_names[0])
+        # preprocess
+        self.recognize_times.preprocess_time_s.start()
+        # isinstance also accepts str subclasses, unlike an exact type comparison
+        if isinstance(input, str):
+            inputs = self.preprocess_video(input)
+        else:
+            inputs = self.preprocess_frames(input)
+        self.recognize_times.preprocess_time_s.end()
+        inputs = np.expand_dims(
+            inputs, axis=0).repeat(
+                self.batch_size, axis=0).copy()
+        input_tensor.copy_from_cpu(inputs)
+        # model prediction
+        self.recognize_times.inference_time_s.start()
+        self.predictor.run()
+        self.recognize_times.inference_time_s.end()
+        output = output_tensor.copy_to_cpu()
+        # postprocess
+        self.recognize_times.postprocess_time_s.start()
+        classes, scores = self.postprocess(output)
+        self.recognize_times.postprocess_time_s.end()
+        return classes, scores
+    def preprocess_frames(self, frame_list):
+        """
+        Run Scale -> CenterCrop -> Image2Array -> Normalization on decoded frames.
+        frame_list: list, frame list
+        return: list holding one array (the processed frames with a leading axis added)
+        """
+        results = {}
+        results['frames_len'] = len(frame_list)
+        results["imgs"] = frame_list
+        img_mean = [0.485, 0.456, 0.406]
+        img_std = [0.229, 0.224, 0.225]
+        ops = [
+            Scale(self.short_size), CenterCrop(self.target_size), Image2Array(),
+            Normalization(img_mean, img_std)
+        ]
+        for op in ops:
+            results = op(results)
+        res = np.expand_dims(results['imgs'], axis=0).copy()
+        return [res]
+    def preprocess_video(self, input_file):
+        """
+        Decode and sample a video file, then apply the same ops as preprocess_frames.
+        input_file: str, file path
+        return: list holding one array (the processed frames with a leading axis added)
+        """
+        # os.path.isfile returns a bool, so it must be asserted directly;
+        # comparing it with None is always true and never catches a missing file.
+        assert os.path.isfile(input_file), "{0} not exists".format(input_file)
+        results = {'filename': input_file}
+        img_mean = [0.485, 0.456, 0.406]
+        img_std = [0.229, 0.224, 0.225]
+        ops = [
+            VideoDecoder(), Sampler(
+                self.num_seg, self.seg_len, valid_mode=True),
+            Scale(self.short_size), CenterCrop(self.target_size), Image2Array(),
+            Normalization(img_mean, img_std)
+        ]
+        for op in ops:
+            results = op(results)
+        res = np.expand_dims(results['imgs'], axis=0).copy()
+        return [res]
+    def postprocess(self, output):
+        """
+        Turn the raw model output into the top_k classes and scores.
+        output: numpy.ndarray, raw output copied from the predictor
+        return: (classes, scores), both ordered by descending score
+        """
+        output = output.flatten()  # numpy.ndarray
+        output = softmax(output)
+        # argpartition picks the top_k indices unordered; argsort then orders them
+        classes = np.argpartition(output, -self.top_k)[-self.top_k:]
+        classes = classes[np.argsort(-output[classes])]
+        scores = output[classes]
+        return classes, scores
+def main():
+    """Run one prediction on FLAGS.video_file, or emit a benchmark log.
+
+    Reads every option from the module-level FLAGS namespace that the
+    `__main__` guard below fills in before calling this function.
+    """
+    benchmarking = FLAGS.run_benchmark
+    # sanity checks on the command line flags for the selected mode
+    if benchmarking:
+        assert FLAGS.use_gpu is True
+    else:
+        assert FLAGS.batch_size == 1
+        assert FLAGS.use_fp16 is False
+    recognizer = VideoActionRecognizer(
+        FLAGS.model_dir,
+        short_size=FLAGS.short_size,
+        target_size=FLAGS.target_size,
+        device=FLAGS.device,
+        run_mode=FLAGS.run_mode,
+        batch_size=FLAGS.batch_size,
+        trt_min_shape=FLAGS.trt_min_shape,
+        trt_max_shape=FLAGS.trt_max_shape,
+        trt_opt_shape=FLAGS.trt_opt_shape,
+        trt_calib_mode=FLAGS.trt_calib_mode,
+        cpu_threads=FLAGS.cpu_threads,
+        enable_mkldnn=FLAGS.enable_mkldnn, )
+    if not benchmarking:
+        # plain inference: report the best class and its score
+        top_classes, top_scores = recognizer.predict(FLAGS.video_file)
+        print("Current video file: {}".format(FLAGS.video_file))
+        print("\ttop-1 class: {0}".format(top_classes[0]))
+        print("\ttop-1 score: {0}".format(top_scores[0]))
+        return
+    # benchmark mode: gather memory usage and timing, then log them
+    cpu_mem, gpu_mem, gpu_usage = get_current_memory_mb()
+    mems = {
+        'cpu_rss_mb': cpu_mem,
+        'gpu_rss_mb': gpu_mem,
+        'gpu_util': gpu_usage * 100
+    }
+    perf_info = recognizer.recognize_times.report()
+    # model name is the last path component; precision is the run_mode suffix
+    model_name = FLAGS.model_dir.strip('/').split('/')[-1]
+    precision = FLAGS.run_mode.split('_')[-1]
+    model_info = {'model_name': model_name, 'precision': precision}
+    data_info = {
+        'batch_size': FLAGS.batch_size,
+        'shape': "dynamic_shape",
+        'data_num': perf_info['img_num']
+    }
+    recognize_log = PaddleInferBenchmark(recognizer.config, model_info,
+                                         data_info, perf_info, mems)
+    recognize_log('Fight')
+if __name__ == '__main__':
+    # inference runs in static-graph mode
+    paddle.enable_static()
+    parser = argsparser()
+    FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
+    # normalise the device name before validating it
+    FLAGS.device = FLAGS.device.upper()
+    assert FLAGS.device in ('CPU', 'GPU', 'XPU'), \
+        "device should be CPU, GPU or XPU"
+    main()
--- a/deploy/python/video_action_preprocess.py
+++ b/deploy/python/video_action_preprocess.py
--- a/deploy/python/visualize.py
+++ b/deploy/python/visualize.py
@@ -365,15 +365,35 @@ def visualize_attr(im, results, boxes=None):
    return im
-def visualize_action(im, mot_boxes, action_visual_collector, action_text=""):
+def visualize_action(im,
+                     mot_boxes,
+                     action_visual_collector=None,
+                     action_text="",
+                     video_action_score=None,
+                     video_action_text=""):
    im = cv2.imread(im) if isinstance(im, str) else im
-    id_detected = action_visual_collector.get_visualize_ids()
+    im_h, im_w = im.shape[:2]
    text_scale = max(1, im.shape[1] / 1600.)
-    for mot_box in mot_boxes:
+    text_thickness = 2
-        # mot_box is a format with [mot_id, class, score, xmin, ymin, w, h] 
-        if mot_box[0] in id_detected:
+    if action_visual_collector:
-            text_position = (int(mot_box[3] + mot_box[5] * 0.75),
+        id_detected = action_visual_collector.get_visualize_ids()
-                             int(mot_box[4] - 10))
+        for mot_box in mot_boxes:
-            cv2.putText(im, action_text, text_position, cv2.FONT_HERSHEY_PLAIN,
+            # mot_box is a format with [mot_id, class, score, xmin, ymin, w, h] 
-                        text_scale, (0, 0, 255), 2)
+            if mot_box[0] in id_detected:
+                text_position = (int(mot_box[3] + mot_box[5] * 0.75),
+                                 int(mot_box[4] - 10))
+                cv2.putText(im, action_text, text_position,
+                            cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), 2)
+    if video_action_score:
+        cv2.putText(
+            im,
+            video_action_text + ': %.2f' % video_action_score,
+            (int(im_w / 2), int(15 * text_scale) + 5),
+            cv2.FONT_ITALIC,
+            text_scale, (0, 0, 255),
+            thickness=text_thickness)
    return im