未验证 提交 67f16ed9 编写于 作者: XYZ_916's avatar XYZ_916 提交者: GitHub

Develop branch: add fight action for pphuman (#6160)

* add fight for PP-Human

* add short_size and target_size for fight recognition

* add short_size and target_size for fight_infer

* modify code according to the reviews

* add back the wrongly deleted lines

* Update pipeline.py

* Update infer_cfg.yml

* visualize fight when fight action occur

* 乱码修改

* delete useless params

* delete useless code str2bool
上级 ed331ba2
......@@ -25,8 +25,13 @@ ATTR:
enable: False
VIDEO_ACTION:
model_dir: output_inference/pp-stm
model_dir: output_inference/ppTSM
batch_size: 1
frame_len: 8
sample_freq: 7
short_size: 340
target_size: 320
basemode: "videobased"
enable: False
SKELETON_ACTION:
......
......@@ -23,6 +23,7 @@ class Result(object):
'mot': dict(),
'attr': dict(),
'kpt': dict(),
'video_action': dict(),
'skeleton_action': dict(),
'reid': dict()
}
......
......@@ -152,6 +152,7 @@ class PipeTimer(Times):
'mot': Times(),
'attr': Times(),
'kpt': Times(),
'video_action': Times(),
'skeleton_action': Times(),
'reid': Times()
}
......@@ -197,6 +198,7 @@ class PipeTimer(Times):
dic['kpt'] = round(self.module_time['kpt'].value() /
max(1, self.img_num),
4) if average else self.module_time['kpt'].value()
dic['video_action'] = self.module_time['video_action'].value()
dic['skeleton_action'] = round(
self.module_time['skeleton_action'].value() / max(1, self.img_num),
4) if average else self.module_time['skeleton_action'].value()
......
......@@ -36,6 +36,7 @@ from python.infer import Detector, DetectorPicoDet
from python.attr_infer import AttrDetector
from python.keypoint_infer import KeyPointDetector
from python.keypoint_postprocess import translate_to_ori_images
from python.video_action_infer import VideoActionRecognizer
from python.action_infer import SkeletonActionRecognizer
from python.action_utils import KeyPointBuff, SkeletonActionVisualHelper
......@@ -75,7 +76,7 @@ class Pipeline(object):
draw_center_traj (bool): Whether drawing the trajectory of center, default as False
secs_interval (int): The seconds interval to count after tracking, default as 10
do_entrance_counting(bool): Whether counting the numbers of identifiers entering
or getting out from the entrance, default as Falseonly support single class
or getting out from the entrance, default as False, only support single class
counting in MOT.
"""
......@@ -181,7 +182,7 @@ class Pipeline(object):
else:
raise ValueError(
"Illegal Input, please set one of ['video_file','camera_id','image_file', 'image_dir']"
"Illegal Input, please set one of ['video_file', 'camera_id', 'image_file', 'image_dir']"
)
return input
......@@ -218,6 +219,7 @@ class PipePredictor(object):
1. Tracking
2. Tracking -> Attribute
3. Tracking -> KeyPoint -> SkeletonAction Recognition
4. VideoAction Recognition
Args:
cfg (dict): config of models in pipeline
......@@ -240,7 +242,7 @@ class PipePredictor(object):
draw_center_traj (bool): Whether drawing the trajectory of center, default as False
secs_interval (int): The seconds interval to count after tracking, default as 10
do_entrance_counting(bool): Whether counting the numbers of identifiers entering
or getting out from the entrance, default as Falseonly support single class
or getting out from the entrance, default as False, only support single class
counting in MOT.
"""
......@@ -277,6 +279,7 @@ class PipePredictor(object):
'ID_BASED_CLSACTION', False) else False
self.with_mtmct = cfg.get('REID', False)['enable'] if cfg.get(
'REID', False) else False
if self.with_attr:
print('Attribute Recognition enabled')
if self.with_skeleton_action:
......@@ -296,6 +299,7 @@ class PipePredictor(object):
"idbased": False,
"skeletonbased": False
}
self.is_video = is_video
self.multi_camera = multi_camera
self.cfg = cfg
......@@ -416,6 +420,31 @@ class PipePredictor(object):
use_dark=False)
self.kpt_buff = KeyPointBuff(skeleton_action_frames)
if self.with_video_action:
video_action_cfg = self.cfg['VIDEO_ACTION']
basemode = video_action_cfg['basemode']
self.modebase[basemode] = True
video_action_model_dir = video_action_cfg['model_dir']
video_action_batch_size = video_action_cfg['batch_size']
short_size = video_action_cfg["short_size"]
target_size = video_action_cfg["target_size"]
self.video_action_predictor = VideoActionRecognizer(
model_dir=video_action_model_dir,
short_size=short_size,
target_size=target_size,
device=device,
run_mode=run_mode,
batch_size=video_action_batch_size,
trt_min_shape=trt_min_shape,
trt_max_shape=trt_max_shape,
trt_opt_shape=trt_opt_shape,
trt_calib_mode=trt_calib_mode,
cpu_threads=cpu_threads,
enable_mkldnn=enable_mkldnn)
if self.with_mtmct:
reid_cfg = self.cfg['REID']
model_dir = reid_cfg['model_dir']
......@@ -523,9 +552,12 @@ class PipePredictor(object):
entrance = [0, height / 2., width, height / 2.]
video_fps = fps
video_action_imgs = []
while (1):
if frame_id % 10 == 0:
print('frame id: ', frame_id)
ret, frame = capture.read()
if not ret:
break
......@@ -660,10 +692,34 @@ class PipePredictor(object):
self.pipeline_res.clear('reid')
if self.with_video_action:
#predeal, get what your model need
#predict, model preprocess\run\postprocess
#postdeal, interact with pipeline
pass
# get the params
frame_len = self.cfg["VIDEO_ACTION"]["frame_len"]
sample_freq = self.cfg["VIDEO_ACTION"]["sample_freq"]
if sample_freq * frame_len > frame_count: # video is too short
sample_freq = int(frame_count / frame_len)
# filter the warmup frames
if frame_id > self.warmup_frame:
self.pipe_timer.module_time['video_action'].start()
# collect frames
if frame_id % sample_freq == 0:
video_action_imgs.append(frame)
# the number of collected frames is enough to predict video action
if len(video_action_imgs) == frame_len:
classes, scores = self.video_action_predictor.predict(
video_action_imgs)
if frame_id > self.warmup_frame:
self.pipe_timer.module_time['video_action'].end()
video_action_res = {"class": classes[0], "score": scores[0]}
self.pipeline_res.update(video_action_res, 'video_action')
print("video_action_res:", video_action_res)
video_action_imgs.clear() # next clip
self.collector.append(frame_id, self.pipeline_res)
......@@ -744,10 +800,21 @@ class PipePredictor(object):
returnimg=True)
skeleton_action_res = result.get('skeleton_action')
if skeleton_action_res is not None:
image = visualize_action(image, mot_res['boxes'],
self.skeleton_action_visual_helper,
"SkeletonAction")
video_action_res = result.get('video_action')
if skeleton_action_res is not None or video_action_res is not None:
video_action_score = None
action_visual_helper = None
if video_action_res and video_action_res["class"] == 1:
video_action_score = video_action_res["score"]
if skeleton_action_res:
action_visual_helper = self.skeleton_action_visual_helper
image = visualize_action(
image,
mot_res['boxes'],
action_visual_collector=action_visual_helper,
action_text="SkeletonAction",
video_action_score=video_action_score,
video_action_text="Fight")
return image
......@@ -784,6 +851,7 @@ class PipePredictor(object):
def main():
cfg = merge_cfg(FLAGS)
print_arguments(cfg)
pipeline = Pipeline(
cfg, FLAGS.image_file, FLAGS.image_dir, FLAGS.video_file,
FLAGS.video_dir, FLAGS.camera_id, FLAGS.device, FLAGS.run_mode,
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import yaml
import glob
import cv2
import numpy as np
import math
import paddle
import sys
from collections import Sequence
import paddle.nn.functional as F
# add deploy path of PaddleDetection to sys.path
parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))
sys.path.insert(0, parent_path)
from paddle.inference import Config, create_predictor
from utils import argsparser, Timer, get_current_memory_mb
from benchmark_utils import PaddleInferBenchmark
from infer import Detector, print_arguments
from video_action_preprocess import VideoDecoder, Sampler, Scale, CenterCrop, Normalization, Image2Array
def softmax(x):
    """
    Compute the softmax over ALL elements of ``x``.

    The normalisation uses the global sum, so a multi-dimensional input is
    treated as one distribution rather than one distribution per row.

    Args:
        x (array-like): raw scores (logits).

    Returns:
        numpy.ndarray: array with the same shape as ``x`` whose elements are
            non-negative and sum to 1. An empty input gives an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        # np.max rejects empty input; there is nothing to normalise anyway.
        return np.exp(x)
    # Softmax is invariant to a constant shift. Shifting by the maximum keeps
    # every exponent <= 0, so np.exp cannot overflow to inf (which would turn
    # the result into NaN for large logits).
    exp_shifted = np.exp(x - np.max(x))
    return exp_shifted / np.sum(exp_shifted)
class VideoActionRecognizer(object):
    """
    Video-level action recognizer built on a Paddle Inference predictor.

    Args:
        model_dir (str): directory containing model.pdmodel and model.pdiparams
        device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
        run_mode (str): mode of running(paddle/trt_fp32/trt_fp16/trt_int8)
        num_seg (int): forwarded to ``Sampler`` when decoding a video file
        seg_len (int): forwarded to ``Sampler`` when decoding a video file
        short_size (int): forwarded to the ``Scale`` preprocessing op
        target_size (int): forwarded to the ``CenterCrop`` preprocessing op
        top_k (int): number of highest-scoring classes returned by ``postprocess``
        batch_size (int): size of pre batch in inference, only 1 is accepted
        trt_min_shape (int): min shape for dynamic shape in trt
        trt_max_shape (int): max shape for dynamic shape in trt
        trt_opt_shape (int): opt shape for dynamic shape in trt
        trt_calib_mode (bool): If the model is produced by TRT offline quantitative
            calibration, trt_calib_mode need to set True
        cpu_threads (int): cpu threads
        enable_mkldnn (bool): whether to open MKLDNN
        ir_optim (bool): forwarded to ``Config.switch_ir_optim``
    """

    def __init__(self,
                 model_dir,
                 device='CPU',
                 run_mode='paddle',
                 num_seg=8,
                 seg_len=1,
                 short_size=256,
                 target_size=224,
                 top_k=1,
                 batch_size=1,
                 trt_min_shape=1,
                 trt_max_shape=1280,
                 trt_opt_shape=640,
                 trt_calib_mode=False,
                 cpu_threads=1,
                 enable_mkldnn=False,
                 ir_optim=True):
        self.num_seg = num_seg
        self.seg_len = seg_len
        self.short_size = short_size
        self.target_size = target_size
        self.top_k = top_k

        assert batch_size == 1, "VideoActionRecognizer only support batch_size=1 now."

        self.model_dir = model_dir
        self.device = device
        self.run_mode = run_mode
        self.batch_size = batch_size
        self.trt_min_shape = trt_min_shape
        self.trt_max_shape = trt_max_shape
        self.trt_opt_shape = trt_opt_shape
        self.trt_calib_mode = trt_calib_mode
        self.cpu_threads = cpu_threads
        self.enable_mkldnn = enable_mkldnn
        self.ir_optim = ir_optim

        self.recognize_times = Timer()

        model_file_path = os.path.join(model_dir, "model.pdmodel")
        params_file_path = os.path.join(model_dir, "model.pdiparams")
        self.config = Config(model_file_path, params_file_path)

        if device in ("GPU", "gpu"):
            # NOTE(review): 8000 / 0 are presumably the initial GPU memory
            # pool (MB) and the device id -- confirm against Paddle Inference.
            self.config.enable_use_gpu(8000, 0)
        else:
            # Every other device string (including XPU) takes the CPU path.
            self.config.disable_gpu()
            # NOTE(review): the MKLDNN block is nested under the CPU branch;
            # the original layout lost its indentation -- confirm.
            if self.enable_mkldnn:
                # cache 10 different shapes for mkldnn to avoid memory leak
                self.config.set_mkldnn_cache_capacity(10)
                self.config.enable_mkldnn()

        self.config.switch_ir_optim(self.ir_optim)  # default true

        precision_map = {
            'trt_int8': Config.Precision.Int8,
            'trt_fp32': Config.Precision.Float32,
            'trt_fp16': Config.Precision.Half
        }
        # TensorRT is only switched on for the trt_* run modes.
        if run_mode in precision_map:
            self.config.enable_tensorrt_engine(
                max_batch_size=self.batch_size,
                precision_mode=precision_map[run_mode])

        self.config.enable_memory_optim()
        # use zero copy
        self.config.switch_use_feed_fetch_ops(False)

        self.predictor = create_predictor(self.config)

    def preprocess(self, input):
        """
        Dispatch to the matching preprocessing routine.

        Args:
            input (str) or (list): video file path or image data list
        Returns:
            list: single-element list produced by ``preprocess_video`` (for a
                str) or ``preprocess_frames`` (for anything else)
        """
        if isinstance(input, str):
            return self.preprocess_video(input)
        return self.preprocess_frames(input)

    def preprocess_batch(self, file_list):
        """
        Preprocess several inputs and concatenate them position by position.

        Args:
            file_list (list): items accepted by ``preprocess``
        Returns:
            list: one concatenated array per output position of
                ``preprocess``; an empty list when ``file_list`` is empty
        """
        per_item = [self.preprocess(item) for item in file_list]
        # zip(*) groups the i-th output of every item together; with an empty
        # file_list it yields nothing instead of failing on per_item[0].
        batched_inputs = [np.concatenate(group) for group in zip(*per_item)]
        self.input_file = file_list
        return batched_inputs

    def get_timer(self):
        """Return the Timer that accumulates the stage timings."""
        return self.recognize_times

    def predict(self, input):
        '''
        Run preprocessing, inference and postprocessing on one input.

        Args:
            input (str) or (list): video file path or image data list
        Returns:
            tuple: ``(classes, scores)`` as produced by ``postprocess``
        '''
        input_names = self.predictor.get_input_names()
        input_tensor = self.predictor.get_input_handle(input_names[0])

        output_names = self.predictor.get_output_names()
        output_tensor = self.predictor.get_output_handle(output_names[0])

        # preprocess
        self.recognize_times.preprocess_time_s.start()
        inputs = self.preprocess(input)
        self.recognize_times.preprocess_time_s.end()

        # Add a leading batch axis and replicate it batch_size times; the
        # copy gives the predictor a contiguous buffer of its own.
        inputs = np.expand_dims(
            inputs, axis=0).repeat(
                self.batch_size, axis=0).copy()

        input_tensor.copy_from_cpu(inputs)

        # model prediction
        self.recognize_times.inference_time_s.start()
        self.predictor.run()
        self.recognize_times.inference_time_s.end()

        output = output_tensor.copy_to_cpu()

        # postprocess
        self.recognize_times.postprocess_time_s.start()
        classes, scores = self.postprocess(output)
        self.recognize_times.postprocess_time_s.end()

        return classes, scores

    def preprocess_frames(self, frame_list):
        """
        Preprocess frames that are already decoded.

        Args:
            frame_list (list): frames of one clip (presumably decoded image
                arrays -- verify against the caller)
        Returns:
            list: single-element list holding the processed clip with a
                leading axis of size 1
        """
        results = {}
        results['frames_len'] = len(frame_list)
        results["imgs"] = frame_list

        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        ops = [
            Scale(self.short_size), CenterCrop(self.target_size), Image2Array(),
            Normalization(img_mean, img_std)
        ]
        for op in ops:
            results = op(results)

        res = np.expand_dims(results['imgs'], axis=0).copy()
        return [res]

    def preprocess_video(self, input_file):
        """
        Decode, sample and preprocess a video file.

        Args:
            input_file (str): path of the video file
        Returns:
            list: single-element list holding the processed clip with a
                leading axis of size 1
        Raises:
            FileNotFoundError: if ``input_file`` is not an existing file
        """
        # The previous `assert os.path.isfile(...) is not None` could never
        # fail because isfile returns a bool; check the result itself.
        if not os.path.isfile(input_file):
            raise FileNotFoundError("{0} not exists".format(input_file))

        results = {'filename': input_file}
        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        ops = [
            VideoDecoder(), Sampler(
                self.num_seg, self.seg_len, valid_mode=True),
            Scale(self.short_size), CenterCrop(self.target_size), Image2Array(),
            Normalization(img_mean, img_std)
        ]
        for op in ops:
            results = op(results)

        res = np.expand_dims(results['imgs'], axis=0).copy()
        return [res]

    def postprocess(self, output):
        """
        Turn the raw network output into the top-k classes and scores.

        Args:
            output (numpy.ndarray): raw output of the predictor; it is
                flattened, so all elements are treated as one score vector
        Returns:
            tuple: ``(classes, scores)`` -- the ``top_k`` class indices sorted
                by descending softmax score, and their scores
        """
        output = output.flatten()  # numpy.ndarray
        output = softmax(output)
        # argpartition selects the top_k largest entries in arbitrary order;
        # the argsort on the negated scores then orders them best-first.
        classes = np.argpartition(output, -self.top_k)[-self.top_k:]
        classes = classes[np.argsort(-output[classes])]
        scores = output[classes]
        return classes, scores
def main():
    """
    Command-line entry: classify one video file, or emit a benchmark log.

    Reads every option from the module-level ``FLAGS``. Without
    ``run_benchmark`` it predicts ``FLAGS.video_file`` and prints the top-1
    class and score; with it, it gathers memory and timing information and
    hands them to ``PaddleInferBenchmark``.
    """
    benchmarking = FLAGS.run_benchmark

    # Sanity checks on the flag combination for the selected mode.
    if benchmarking:
        assert FLAGS.use_gpu is True
    else:
        assert FLAGS.batch_size == 1
        assert FLAGS.use_fp16 is False

    recognizer = VideoActionRecognizer(
        FLAGS.model_dir,
        short_size=FLAGS.short_size,
        target_size=FLAGS.target_size,
        device=FLAGS.device,
        run_mode=FLAGS.run_mode,
        batch_size=FLAGS.batch_size,
        trt_min_shape=FLAGS.trt_min_shape,
        trt_max_shape=FLAGS.trt_max_shape,
        trt_opt_shape=FLAGS.trt_opt_shape,
        trt_calib_mode=FLAGS.trt_calib_mode,
        cpu_threads=FLAGS.cpu_threads,
        enable_mkldnn=FLAGS.enable_mkldnn)

    if not benchmarking:
        # Plain inference: report the best class for the given video.
        classes, scores = recognizer.predict(FLAGS.video_file)
        print("Current video file: {}".format(FLAGS.video_file))
        print("\ttop-1 class: {0}".format(classes[0]))
        print("\ttop-1 score: {0}".format(scores[0]))
        return

    # Benchmark mode: collect resource usage and the timer report.
    cpu_mem, gpu_mem, gpu_util = get_current_memory_mb()
    mems = {
        'cpu_rss_mb': cpu_mem,
        'gpu_rss_mb': gpu_mem,
        'gpu_util': gpu_util * 100
    }

    perf_info = recognizer.recognize_times.report()
    model_dir = FLAGS.model_dir
    mode = FLAGS.run_mode
    # Model name is the last path component; precision is the suffix of the
    # run mode after the final underscore.
    model_info = {
        'model_name': model_dir.strip('/').split('/')[-1],
        'precision': mode.split('_')[-1]
    }
    data_info = {
        'batch_size': FLAGS.batch_size,
        'shape': "dynamic_shape",
        'data_num': perf_info['img_num']
    }

    recognize_log = PaddleInferBenchmark(recognizer.config, model_info,
                                         data_info, perf_info, mems)
    recognize_log('Fight')
if __name__ == '__main__':
    # Static-graph mode is enabled before anything else runs.
    paddle.enable_static()

    # FLAGS is a module-level name on purpose: main() reads it as a global.
    FLAGS = argsparser().parse_args()
    print_arguments(FLAGS)

    # Normalise the device name so the membership check is case-insensitive.
    FLAGS.device = FLAGS.device.upper()
    supported_devices = ('CPU', 'GPU', 'XPU')
    assert FLAGS.device in supported_devices, "device should be CPU, GPU or XPU"

    main()
此差异已折叠。
......@@ -365,15 +365,35 @@ def visualize_attr(im, results, boxes=None):
return im
def visualize_action(im, mot_boxes, action_visual_collector, action_text=""):
def visualize_action(im,
mot_boxes,
action_visual_collector=None,
action_text="",
video_action_score=None,
video_action_text=""):
im = cv2.imread(im) if isinstance(im, str) else im
id_detected = action_visual_collector.get_visualize_ids()
im_h, im_w = im.shape[:2]
text_scale = max(1, im.shape[1] / 1600.)
for mot_box in mot_boxes:
# mot_box is a format with [mot_id, class, score, xmin, ymin, w, h]
if mot_box[0] in id_detected:
text_position = (int(mot_box[3] + mot_box[5] * 0.75),
int(mot_box[4] - 10))
cv2.putText(im, action_text, text_position, cv2.FONT_HERSHEY_PLAIN,
text_scale, (0, 0, 255), 2)
text_thickness = 2
if action_visual_collector:
id_detected = action_visual_collector.get_visualize_ids()
for mot_box in mot_boxes:
# mot_box is a format with [mot_id, class, score, xmin, ymin, w, h]
if mot_box[0] in id_detected:
text_position = (int(mot_box[3] + mot_box[5] * 0.75),
int(mot_box[4] - 10))
cv2.putText(im, action_text, text_position,
cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), 2)
if video_action_score:
cv2.putText(
im,
video_action_text + ': %.2f' % video_action_score,
(int(im_w / 2), int(15 * text_scale) + 5),
cv2.FONT_ITALIC,
text_scale, (0, 0, 255),
thickness=text_thickness)
return im
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册