未验证 提交 67f16ed9 编写于 作者: XYZ_916's avatar XYZ_916 提交者: GitHub

Develop branch: add fight action for pphuman (#6160)

* add fight for PP-Human

* add short_size and target_size for fight recognition

* add short_size and target_size for fight_infer

* modify code according to the reviews

* add the wrong deleted lines`

* Update pipeline.py

* Update infer_cfg.yml

* visualize fight when fight action occur

* 乱码修改

* delete useless parmas

* delete useless code str2bool
上级 ed331ba2
...@@ -25,8 +25,13 @@ ATTR: ...@@ -25,8 +25,13 @@ ATTR:
enable: False enable: False
VIDEO_ACTION: VIDEO_ACTION:
model_dir: output_inference/pp-stm model_dir: output_inference/ppTSM
batch_size: 1 batch_size: 1
frame_len: 8
sample_freq: 7
short_size: 340
target_size: 320
basemode: "videobased"
enable: False enable: False
SKELETON_ACTION: SKELETON_ACTION:
......
...@@ -23,6 +23,7 @@ class Result(object): ...@@ -23,6 +23,7 @@ class Result(object):
'mot': dict(), 'mot': dict(),
'attr': dict(), 'attr': dict(),
'kpt': dict(), 'kpt': dict(),
'video_action': dict(),
'skeleton_action': dict(), 'skeleton_action': dict(),
'reid': dict() 'reid': dict()
} }
......
...@@ -152,6 +152,7 @@ class PipeTimer(Times): ...@@ -152,6 +152,7 @@ class PipeTimer(Times):
'mot': Times(), 'mot': Times(),
'attr': Times(), 'attr': Times(),
'kpt': Times(), 'kpt': Times(),
'video_action': Times(),
'skeleton_action': Times(), 'skeleton_action': Times(),
'reid': Times() 'reid': Times()
} }
...@@ -197,6 +198,7 @@ class PipeTimer(Times): ...@@ -197,6 +198,7 @@ class PipeTimer(Times):
dic['kpt'] = round(self.module_time['kpt'].value() / dic['kpt'] = round(self.module_time['kpt'].value() /
max(1, self.img_num), max(1, self.img_num),
4) if average else self.module_time['kpt'].value() 4) if average else self.module_time['kpt'].value()
dic['video_action'] = self.module_time['video_action'].value()
dic['skeleton_action'] = round( dic['skeleton_action'] = round(
self.module_time['skeleton_action'].value() / max(1, self.img_num), self.module_time['skeleton_action'].value() / max(1, self.img_num),
4) if average else self.module_time['skeleton_action'].value() 4) if average else self.module_time['skeleton_action'].value()
......
...@@ -36,6 +36,7 @@ from python.infer import Detector, DetectorPicoDet ...@@ -36,6 +36,7 @@ from python.infer import Detector, DetectorPicoDet
from python.attr_infer import AttrDetector from python.attr_infer import AttrDetector
from python.keypoint_infer import KeyPointDetector from python.keypoint_infer import KeyPointDetector
from python.keypoint_postprocess import translate_to_ori_images from python.keypoint_postprocess import translate_to_ori_images
from python.video_action_infer import VideoActionRecognizer
from python.action_infer import SkeletonActionRecognizer from python.action_infer import SkeletonActionRecognizer
from python.action_utils import KeyPointBuff, SkeletonActionVisualHelper from python.action_utils import KeyPointBuff, SkeletonActionVisualHelper
...@@ -75,7 +76,7 @@ class Pipeline(object): ...@@ -75,7 +76,7 @@ class Pipeline(object):
draw_center_traj (bool): Whether drawing the trajectory of center, default as False draw_center_traj (bool): Whether drawing the trajectory of center, default as False
secs_interval (int): The seconds interval to count after tracking, default as 10 secs_interval (int): The seconds interval to count after tracking, default as 10
do_entrance_counting(bool): Whether counting the numbers of identifiers entering do_entrance_counting(bool): Whether counting the numbers of identifiers entering
or getting out from the entrance, default as Falseonly support single class or getting out from the entrance, default as False, only support single class
counting in MOT. counting in MOT.
""" """
...@@ -181,7 +182,7 @@ class Pipeline(object): ...@@ -181,7 +182,7 @@ class Pipeline(object):
else: else:
raise ValueError( raise ValueError(
"Illegal Input, please set one of ['video_file','camera_id','image_file', 'image_dir']" "Illegal Input, please set one of ['video_file', 'camera_id', 'image_file', 'image_dir']"
) )
return input return input
...@@ -218,6 +219,7 @@ class PipePredictor(object): ...@@ -218,6 +219,7 @@ class PipePredictor(object):
1. Tracking 1. Tracking
2. Tracking -> Attribute 2. Tracking -> Attribute
3. Tracking -> KeyPoint -> SkeletonAction Recognition 3. Tracking -> KeyPoint -> SkeletonAction Recognition
4. VideoAction Recognition
Args: Args:
cfg (dict): config of models in pipeline cfg (dict): config of models in pipeline
...@@ -240,7 +242,7 @@ class PipePredictor(object): ...@@ -240,7 +242,7 @@ class PipePredictor(object):
draw_center_traj (bool): Whether drawing the trajectory of center, default as False draw_center_traj (bool): Whether drawing the trajectory of center, default as False
secs_interval (int): The seconds interval to count after tracking, default as 10 secs_interval (int): The seconds interval to count after tracking, default as 10
do_entrance_counting(bool): Whether counting the numbers of identifiers entering do_entrance_counting(bool): Whether counting the numbers of identifiers entering
or getting out from the entrance, default as Falseonly support single class or getting out from the entrance, default as False, only support single class
counting in MOT. counting in MOT.
""" """
...@@ -277,6 +279,7 @@ class PipePredictor(object): ...@@ -277,6 +279,7 @@ class PipePredictor(object):
'ID_BASED_CLSACTION', False) else False 'ID_BASED_CLSACTION', False) else False
self.with_mtmct = cfg.get('REID', False)['enable'] if cfg.get( self.with_mtmct = cfg.get('REID', False)['enable'] if cfg.get(
'REID', False) else False 'REID', False) else False
if self.with_attr: if self.with_attr:
print('Attribute Recognition enabled') print('Attribute Recognition enabled')
if self.with_skeleton_action: if self.with_skeleton_action:
...@@ -296,6 +299,7 @@ class PipePredictor(object): ...@@ -296,6 +299,7 @@ class PipePredictor(object):
"idbased": False, "idbased": False,
"skeletonbased": False "skeletonbased": False
} }
self.is_video = is_video self.is_video = is_video
self.multi_camera = multi_camera self.multi_camera = multi_camera
self.cfg = cfg self.cfg = cfg
...@@ -416,6 +420,31 @@ class PipePredictor(object): ...@@ -416,6 +420,31 @@ class PipePredictor(object):
use_dark=False) use_dark=False)
self.kpt_buff = KeyPointBuff(skeleton_action_frames) self.kpt_buff = KeyPointBuff(skeleton_action_frames)
if self.with_video_action:
video_action_cfg = self.cfg['VIDEO_ACTION']
basemode = video_action_cfg['basemode']
self.modebase[basemode] = True
video_action_model_dir = video_action_cfg['model_dir']
video_action_batch_size = video_action_cfg['batch_size']
short_size = video_action_cfg["short_size"]
target_size = video_action_cfg["target_size"]
self.video_action_predictor = VideoActionRecognizer(
model_dir=video_action_model_dir,
short_size=short_size,
target_size=target_size,
device=device,
run_mode=run_mode,
batch_size=video_action_batch_size,
trt_min_shape=trt_min_shape,
trt_max_shape=trt_max_shape,
trt_opt_shape=trt_opt_shape,
trt_calib_mode=trt_calib_mode,
cpu_threads=cpu_threads,
enable_mkldnn=enable_mkldnn)
if self.with_mtmct: if self.with_mtmct:
reid_cfg = self.cfg['REID'] reid_cfg = self.cfg['REID']
model_dir = reid_cfg['model_dir'] model_dir = reid_cfg['model_dir']
...@@ -523,9 +552,12 @@ class PipePredictor(object): ...@@ -523,9 +552,12 @@ class PipePredictor(object):
entrance = [0, height / 2., width, height / 2.] entrance = [0, height / 2., width, height / 2.]
video_fps = fps video_fps = fps
video_action_imgs = []
while (1): while (1):
if frame_id % 10 == 0: if frame_id % 10 == 0:
print('frame id: ', frame_id) print('frame id: ', frame_id)
ret, frame = capture.read() ret, frame = capture.read()
if not ret: if not ret:
break break
...@@ -660,10 +692,34 @@ class PipePredictor(object): ...@@ -660,10 +692,34 @@ class PipePredictor(object):
self.pipeline_res.clear('reid') self.pipeline_res.clear('reid')
if self.with_video_action: if self.with_video_action:
#predeal, get what your model need # get the params
#predict, model preprocess\run\postprocess frame_len = self.cfg["VIDEO_ACTION"]["frame_len"]
#postdeal, interact with pipeline sample_freq = self.cfg["VIDEO_ACTION"]["sample_freq"]
pass
if sample_freq * frame_len > frame_count: # video is too short
sample_freq = int(frame_count / frame_len)
# filter the warmup frames
if frame_id > self.warmup_frame:
self.pipe_timer.module_time['video_action'].start()
# collect frames
if frame_id % sample_freq == 0:
video_action_imgs.append(frame)
# the number of collected frames is enough to predict video action
if len(video_action_imgs) == frame_len:
classes, scores = self.video_action_predictor.predict(
video_action_imgs)
if frame_id > self.warmup_frame:
self.pipe_timer.module_time['video_action'].end()
video_action_res = {"class": classes[0], "score": scores[0]}
self.pipeline_res.update(video_action_res, 'video_action')
print("video_action_res:", video_action_res)
video_action_imgs.clear() # next clip
self.collector.append(frame_id, self.pipeline_res) self.collector.append(frame_id, self.pipeline_res)
...@@ -744,10 +800,21 @@ class PipePredictor(object): ...@@ -744,10 +800,21 @@ class PipePredictor(object):
returnimg=True) returnimg=True)
skeleton_action_res = result.get('skeleton_action') skeleton_action_res = result.get('skeleton_action')
if skeleton_action_res is not None: video_action_res = result.get('video_action')
image = visualize_action(image, mot_res['boxes'], if skeleton_action_res is not None or video_action_res is not None:
self.skeleton_action_visual_helper, video_action_score = None
"SkeletonAction") action_visual_helper = None
if video_action_res and video_action_res["class"] == 1:
video_action_score = video_action_res["score"]
if skeleton_action_res:
action_visual_helper = self.skeleton_action_visual_helper
image = visualize_action(
image,
mot_res['boxes'],
action_visual_collector=action_visual_helper,
action_text="SkeletonAction",
video_action_score=video_action_score,
video_action_text="Fight")
return image return image
...@@ -784,6 +851,7 @@ class PipePredictor(object): ...@@ -784,6 +851,7 @@ class PipePredictor(object):
def main(): def main():
cfg = merge_cfg(FLAGS) cfg = merge_cfg(FLAGS)
print_arguments(cfg) print_arguments(cfg)
pipeline = Pipeline( pipeline = Pipeline(
cfg, FLAGS.image_file, FLAGS.image_dir, FLAGS.video_file, cfg, FLAGS.image_file, FLAGS.image_dir, FLAGS.video_file,
FLAGS.video_dir, FLAGS.camera_id, FLAGS.device, FLAGS.run_mode, FLAGS.video_dir, FLAGS.camera_id, FLAGS.device, FLAGS.run_mode,
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import yaml
import glob
import cv2
import numpy as np
import math
import paddle
import sys
from collections import Sequence
import paddle.nn.functional as F
# add deploy path of PadleDetection to sys.path
parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))
sys.path.insert(0, parent_path)
from paddle.inference import Config, create_predictor
from utils import argsparser, Timer, get_current_memory_mb
from benchmark_utils import PaddleInferBenchmark
from infer import Detector, print_arguments
from video_action_preprocess import VideoDecoder, Sampler, Scale, CenterCrop, Normalization, Image2Array
def softmax(x):
f_x = np.exp(x) / np.sum(np.exp(x))
return f_x
class VideoActionRecognizer(object):
"""
Args:
model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml
device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
run_mode (str): mode of running(paddle/trt_fp32/trt_fp16)
batch_size (int): size of pre batch in inference
trt_min_shape (int): min shape for dynamic shape in trt
trt_max_shape (int): max shape for dynamic shape in trt
trt_opt_shape (int): opt shape for dynamic shape in trt
trt_calib_mode (bool): If the model is produced by TRT offline quantitative
calibration, trt_calib_mode need to set True
cpu_threads (int): cpu threads
enable_mkldnn (bool): whether to open MKLDNN
"""
def __init__(self,
model_dir,
device='CPU',
run_mode='paddle',
num_seg=8,
seg_len=1,
short_size=256,
target_size=224,
top_k=1,
batch_size=1,
trt_min_shape=1,
trt_max_shape=1280,
trt_opt_shape=640,
trt_calib_mode=False,
cpu_threads=1,
enable_mkldnn=False,
ir_optim=True):
self.num_seg = num_seg
self.seg_len = seg_len
self.short_size = short_size
self.target_size = target_size
self.top_k = top_k
assert batch_size == 1, "VideoActionRecognizer only support batch_size=1 now."
self.model_dir = model_dir
self.device = device
self.run_mode = run_mode
self.batch_size = batch_size
self.trt_min_shape = trt_min_shape
self.trt_max_shape = trt_max_shape
self.trt_opt_shape = trt_opt_shape
self.trt_calib_mode = trt_calib_mode
self.cpu_threads = cpu_threads
self.enable_mkldnn = enable_mkldnn
self.ir_optim = ir_optim
self.recognize_times = Timer()
model_file_path = os.path.join(model_dir, "model.pdmodel")
params_file_path = os.path.join(model_dir, "model.pdiparams")
self.config = Config(model_file_path, params_file_path)
if device == "GPU" or device == "gpu":
self.config.enable_use_gpu(8000, 0)
else:
self.config.disable_gpu()
if self.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
self.config.set_mkldnn_cache_capacity(10)
self.config.enable_mkldnn()
self.config.switch_ir_optim(self.ir_optim) # default true
precision_map = {
'trt_int8': Config.Precision.Int8,
'trt_fp32': Config.Precision.Float32,
'trt_fp16': Config.Precision.Half
}
if run_mode in precision_map.keys():
self.config.enable_tensorrt_engine(
max_batch_size=self.batch_size,
precision_mode=precision_map[run_mode])
self.config.enable_memory_optim()
# use zero copy
self.config.switch_use_feed_fetch_ops(False)
self.predictor = create_predictor(self.config)
def preprocess_batch(self, file_list):
batched_inputs = []
for file in file_list:
inputs = self.preprocess(file)
batched_inputs.append(inputs)
batched_inputs = [
np.concatenate([item[i] for item in batched_inputs])
for i in range(len(batched_inputs[0]))
]
self.input_file = file_list
return batched_inputs
def get_timer(self):
return self.recognize_times
def predict(self, input):
'''
Args:
input (str) or (list): video file path or image data list
Returns:
results (dict):
'''
input_names = self.predictor.get_input_names()
input_tensor = self.predictor.get_input_handle(input_names[0])
output_names = self.predictor.get_output_names()
output_tensor = self.predictor.get_output_handle(output_names[0])
# preprocess
self.recognize_times.preprocess_time_s.start()
if type(input) == str:
inputs = self.preprocess_video(input)
else:
inputs = self.preprocess_frames(input)
self.recognize_times.preprocess_time_s.end()
inputs = np.expand_dims(
inputs, axis=0).repeat(
self.batch_size, axis=0).copy()
input_tensor.copy_from_cpu(inputs)
# model prediction
self.recognize_times.inference_time_s.start()
self.predictor.run()
self.recognize_times.inference_time_s.end()
output = output_tensor.copy_to_cpu()
# postprocess
self.recognize_times.postprocess_time_s.start()
classes, scores = self.postprocess(output)
self.recognize_times.postprocess_time_s.end()
return classes, scores
def preprocess_frames(self, frame_list):
"""
frame_list: list, frame list
return: list
"""
results = {}
results['frames_len'] = len(frame_list)
results["imgs"] = frame_list
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
ops = [
Scale(self.short_size), CenterCrop(self.target_size), Image2Array(),
Normalization(img_mean, img_std)
]
for op in ops:
results = op(results)
res = np.expand_dims(results['imgs'], axis=0).copy()
return [res]
def preprocess_video(self, input_file):
"""
input_file: str, file path
return: list
"""
assert os.path.isfile(input_file) is not None, "{0} not exists".format(
input_file)
results = {'filename': input_file}
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
ops = [
VideoDecoder(), Sampler(
self.num_seg, self.seg_len, valid_mode=True),
Scale(self.short_size), CenterCrop(self.target_size), Image2Array(),
Normalization(img_mean, img_std)
]
for op in ops:
results = op(results)
res = np.expand_dims(results['imgs'], axis=0).copy()
return [res]
def postprocess(self, output):
output = output.flatten() # numpy.ndarray
output = softmax(output)
classes = np.argpartition(output, -self.top_k)[-self.top_k:]
classes = classes[np.argsort(-output[classes])]
scores = output[classes]
return classes, scores
def main():
if not FLAGS.run_benchmark:
assert FLAGS.batch_size == 1
assert FLAGS.use_fp16 is False
else:
assert FLAGS.use_gpu is True
recognizer = VideoActionRecognizer(
FLAGS.model_dir,
short_size=FLAGS.short_size,
target_size=FLAGS.target_size,
device=FLAGS.device,
run_mode=FLAGS.run_mode,
batch_size=FLAGS.batch_size,
trt_min_shape=FLAGS.trt_min_shape,
trt_max_shape=FLAGS.trt_max_shape,
trt_opt_shape=FLAGS.trt_opt_shape,
trt_calib_mode=FLAGS.trt_calib_mode,
cpu_threads=FLAGS.cpu_threads,
enable_mkldnn=FLAGS.enable_mkldnn, )
if not FLAGS.run_benchmark:
classes, scores = recognizer.predict(FLAGS.video_file)
print("Current video file: {}".format(FLAGS.video_file))
print("\ttop-1 class: {0}".format(classes[0]))
print("\ttop-1 score: {0}".format(scores[0]))
else:
cm, gm, gu = get_current_memory_mb()
mems = {'cpu_rss_mb': cm, 'gpu_rss_mb': gm, 'gpu_util': gu * 100}
perf_info = recognizer.recognize_times.report()
model_dir = FLAGS.model_dir
mode = FLAGS.run_mode
model_info = {
'model_name': model_dir.strip('/').split('/')[-1],
'precision': mode.split('_')[-1]
}
data_info = {
'batch_size': FLAGS.batch_size,
'shape': "dynamic_shape",
'data_num': perf_info['img_num']
}
recognize_log = PaddleInferBenchmark(recognizer.config, model_info,
data_info, perf_info, mems)
recognize_log('Fight')
if __name__ == '__main__':
paddle.enable_static()
parser = argsparser()
FLAGS = parser.parse_args()
print_arguments(FLAGS)
FLAGS.device = FLAGS.device.upper()
assert FLAGS.device in ['CPU', 'GPU', 'XPU'
], "device should be CPU, GPU or XPU"
main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import cv2
import numpy as np
from collections.abc import Sequence
from PIL import Image
import paddle
class Sampler(object):
"""
Sample frames id.
NOTE: Use PIL to read image here, has diff with CV2
Args:
num_seg(int): number of segments.
seg_len(int): number of sampled frames in each segment.
valid_mode(bool): True or False.
Returns:
frames_idx: the index of sampled #frames.
"""
def __init__(self,
num_seg,
seg_len,
frame_interval=None,
valid_mode=True,
dense_sample=False,
linspace_sample=False,
use_pil=True):
self.num_seg = num_seg
self.seg_len = seg_len
self.frame_interval = frame_interval
self.valid_mode = valid_mode
self.dense_sample = dense_sample
self.linspace_sample = linspace_sample
self.use_pil = use_pil
def _get(self, frames_idx, results):
data_format = results['format']
if data_format == "frame":
frame_dir = results['frame_dir']
imgs = []
for idx in frames_idx:
img = Image.open(
os.path.join(frame_dir, results['suffix'].format(
idx))).convert('RGB')
imgs.append(img)
elif data_format == "video":
if results['backend'] == 'cv2':
frames = np.array(results['frames'])
imgs = []
for idx in frames_idx:
imgbuf = frames[idx]
img = Image.fromarray(imgbuf, mode='RGB')
imgs.append(img)
elif results['backend'] == 'decord':
container = results['frames']
if self.use_pil:
frames_select = container.get_batch(frames_idx)
# dearray_to_img
np_frames = frames_select.asnumpy()
imgs = []
for i in range(np_frames.shape[0]):
imgbuf = np_frames[i]
imgs.append(Image.fromarray(imgbuf, mode='RGB'))
else:
if frames_idx.ndim != 1:
frames_idx = np.squeeze(frames_idx)
frame_dict = {
idx: container[idx].asnumpy()
for idx in np.unique(frames_idx)
}
imgs = [frame_dict[idx] for idx in frames_idx]
elif results['backend'] == 'pyav':
imgs = []
frames = np.array(results['frames'])
for idx in frames_idx:
imgbuf = frames[idx]
imgs.append(imgbuf)
imgs = np.stack(imgs) # thwc
else:
raise NotImplementedError
else:
raise NotImplementedError
results['imgs'] = imgs # all image data
return results
def _get_train_clips(self, num_frames):
ori_seg_len = self.seg_len * self.frame_interval
avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg
if avg_interval > 0:
base_offsets = np.arange(self.num_seg) * avg_interval
clip_offsets = base_offsets + np.random.randint(
avg_interval, size=self.num_seg)
elif num_frames > max(self.num_seg, ori_seg_len):
clip_offsets = np.sort(
np.random.randint(
num_frames - ori_seg_len + 1, size=self.num_seg))
elif avg_interval == 0:
ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg
clip_offsets = np.around(np.arange(self.num_seg) * ratio)
else:
clip_offsets = np.zeros((self.num_seg, ), dtype=np.int)
return clip_offsets
def _get_test_clips(self, num_frames):
ori_seg_len = self.seg_len * self.frame_interval
avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg)
if num_frames > ori_seg_len - 1:
base_offsets = np.arange(self.num_seg) * avg_interval
clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int)
else:
clip_offsets = np.zeros((self.num_seg, ), dtype=np.int)
return clip_offsets
def __call__(self, results):
"""
Args:
frames_len: length of frames.
return:
sampling id.
"""
frames_len = int(results['frames_len']) # total number of frames
frames_idx = []
if self.frame_interval is not None:
assert isinstance(self.frame_interval, int)
if not self.valid_mode:
offsets = self._get_train_clips(frames_len)
else:
offsets = self._get_test_clips(frames_len)
offsets = offsets[:, None] + np.arange(self.seg_len)[
None, :] * self.frame_interval
offsets = np.concatenate(offsets)
offsets = offsets.reshape((-1, self.seg_len))
offsets = np.mod(offsets, frames_len)
offsets = np.concatenate(offsets)
if results['format'] == 'video':
frames_idx = offsets
elif results['format'] == 'frame':
frames_idx = list(offsets + 1)
else:
raise NotImplementedError
return self._get(frames_idx, results)
print("self.frame_interval:", self.frame_interval)
if self.linspace_sample: # default if False
if 'start_idx' in results and 'end_idx' in results:
offsets = np.linspace(results['start_idx'], results['end_idx'],
self.num_seg)
else:
offsets = np.linspace(0, frames_len - 1, self.num_seg)
offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64)
if results['format'] == 'video':
frames_idx = list(offsets)
frames_idx = [x % frames_len for x in frames_idx]
elif results['format'] == 'frame':
frames_idx = list(offsets + 1)
else:
raise NotImplementedError
return self._get(frames_idx, results)
average_dur = int(frames_len / self.num_seg)
print("results['format']:", results['format'])
if self.dense_sample: # For ppTSM, default is False
if not self.valid_mode: # train
sample_pos = max(1, 1 + frames_len - 64)
t_stride = 64 // self.num_seg
start_idx = 0 if sample_pos == 1 else np.random.randint(
0, sample_pos - 1)
offsets = [(idx * t_stride + start_idx) % frames_len + 1
for idx in range(self.num_seg)]
frames_idx = offsets
else:
sample_pos = max(1, 1 + frames_len - 64)
t_stride = 64 // self.num_seg
start_list = np.linspace(0, sample_pos - 1, num=10, dtype=int)
offsets = []
for start_idx in start_list.tolist():
offsets += [(idx * t_stride + start_idx) % frames_len + 1
for idx in range(self.num_seg)]
frames_idx = offsets
else:
for i in range(self.num_seg):
idx = 0
if not self.valid_mode:
if average_dur >= self.seg_len:
idx = random.randint(0, average_dur - self.seg_len)
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
else:
if average_dur >= self.seg_len:
idx = (average_dur - 1) // 2
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
for jj in range(idx, idx + self.seg_len):
if results['format'] == 'video':
frames_idx.append(int(jj % frames_len))
elif results['format'] == 'frame':
frames_idx.append(jj + 1)
elif results['format'] == 'MRI':
frames_idx.append(jj)
else:
raise NotImplementedError
return self._get(frames_idx, results)
class Scale(object):
"""
Scale images.
Args:
short_size(float | int): Short size of an image will be scaled to the short_size.
fixed_ratio(bool): Set whether to zoom according to a fixed ratio. default: True
do_round(bool): Whether to round up when calculating the zoom ratio. default: False
backend(str): Choose pillow or cv2 as the graphics processing backend. default: 'pillow'
"""
def __init__(self,
short_size,
fixed_ratio=True,
keep_ratio=None,
do_round=False,
backend='pillow'):
self.short_size = short_size
assert (fixed_ratio and not keep_ratio) or (
not fixed_ratio
), "fixed_ratio and keep_ratio cannot be true at the same time"
self.fixed_ratio = fixed_ratio
self.keep_ratio = keep_ratio
self.do_round = do_round
assert backend in [
'pillow', 'cv2'
], "Scale's backend must be pillow or cv2, but get {backend}"
self.backend = backend
def __call__(self, results):
"""
Performs resize operations.
Args:
imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.
For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
return:
resized_imgs: List where each item is a PIL.Image after scaling.
"""
imgs = results['imgs']
resized_imgs = []
for i in range(len(imgs)):
img = imgs[i]
if isinstance(img, np.ndarray):
h, w, _ = img.shape
elif isinstance(img, Image.Image):
w, h = img.size
else:
raise NotImplementedError
if w <= h:
ow = self.short_size
if self.fixed_ratio: # default is True
oh = int(self.short_size * 4.0 / 3.0)
elif not self.keep_ratio: # no
oh = self.short_size
else:
scale_factor = self.short_size / w
oh = int(h * float(scale_factor) +
0.5) if self.do_round else int(h *
self.short_size / w)
ow = int(w * float(scale_factor) +
0.5) if self.do_round else int(w *
self.short_size / h)
else:
oh = self.short_size
if self.fixed_ratio:
ow = int(self.short_size * 4.0 / 3.0)
elif not self.keep_ratio: # no
ow = self.short_size
else:
scale_factor = self.short_size / h
oh = int(h * float(scale_factor) +
0.5) if self.do_round else int(h *
self.short_size / w)
ow = int(w * float(scale_factor) +
0.5) if self.do_round else int(w *
self.short_size / h)
if type(img) == np.ndarray:
img = Image.fromarray(img, mode='RGB')
if self.backend == 'pillow':
resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))
elif self.backend == 'cv2' and (self.keep_ratio is not None):
resized_imgs.append(
cv2.resize(
img, (ow, oh), interpolation=cv2.INTER_LINEAR))
else:
resized_imgs.append(
Image.fromarray(
cv2.resize(
np.asarray(img), (ow, oh),
interpolation=cv2.INTER_LINEAR)))
results['imgs'] = resized_imgs
return results
class CenterCrop(object):
"""
Center crop images
Args:
target_size(int): Center crop a square with the target_size from an image.
do_round(bool): Whether to round up the coordinates of the upper left corner of the cropping area. default: True
"""
def __init__(self, target_size, do_round=True, backend='pillow'):
self.target_size = target_size
self.do_round = do_round
self.backend = backend
def __call__(self, results):
"""
Performs Center crop operations.
Args:
imgs: List where each item is a PIL.Image.
For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
return:
ccrop_imgs: List where each item is a PIL.Image after Center crop.
"""
imgs = results['imgs']
ccrop_imgs = []
th, tw = self.target_size, self.target_size
if isinstance(imgs, paddle.Tensor):
h, w = imgs.shape[-2:]
x1 = int(round((w - tw) / 2.0)) if self.do_round else (w - tw) // 2
y1 = int(round((h - th) / 2.0)) if self.do_round else (h - th) // 2
ccrop_imgs = imgs[:, :, y1:y1 + th, x1:x1 + tw]
else:
for img in imgs:
if self.backend == 'pillow':
w, h = img.size
elif self.backend == 'cv2':
h, w, _ = img.shape
else:
raise NotImplementedError
assert (w >= self.target_size) and (h >= self.target_size), \
"image width({}) and height({}) should be larger than crop size".format(
w, h, self.target_size)
x1 = int(round((w - tw) / 2.0)) if self.do_round else (
w - tw) // 2
y1 = int(round((h - th) / 2.0)) if self.do_round else (
h - th) // 2
if self.backend == 'cv2':
ccrop_imgs.append(img[y1:y1 + th, x1:x1 + tw])
elif self.backend == 'pillow':
ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th)))
results['imgs'] = ccrop_imgs
return results
class Image2Array(object):
"""
transfer PIL.Image to Numpy array and transpose dimensions from 'dhwc' to 'dchw'.
Args:
transpose: whether to transpose or not, default True, False for slowfast.
"""
def __init__(self, transpose=True, data_format='tchw'):
assert data_format in [
'tchw', 'cthw'
], "Target format must in ['tchw', 'cthw'], but got {data_format}"
self.transpose = transpose
self.data_format = data_format
def __call__(self, results):
"""
Performs Image to NumpyArray operations.
Args:
imgs: List where each item is a PIL.Image.
For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
return:
np_imgs: Numpy array.
"""
imgs = results['imgs']
if 'backend' in results and results[
'backend'] == 'pyav': # [T,H,W,C] in [0, 1]
if self.transpose:
if self.data_format == 'tchw':
t_imgs = imgs.transpose((0, 3, 1, 2)) # tchw
else:
t_imgs = imgs.transpose((3, 0, 1, 2)) # cthw
results['imgs'] = t_imgs
else:
t_imgs = np.stack(imgs).astype('float32')
if self.transpose:
if self.data_format == 'tchw':
t_imgs = t_imgs.transpose(0, 3, 1, 2) # tchw
else:
t_imgs = t_imgs.transpose(3, 0, 1, 2) # cthw
results['imgs'] = t_imgs
return results
class VideoDecoder(object):
"""
Decode mp4 file to frames.
Args:
filepath: the file path of mp4 file
"""
def __init__(self,
backend='cv2',
mode='train',
sampling_rate=32,
num_seg=8,
num_clips=1,
target_fps=30):
self.backend = backend
# params below only for TimeSformer
self.mode = mode
self.sampling_rate = sampling_rate
self.num_seg = num_seg
self.num_clips = num_clips
self.target_fps = target_fps
def __call__(self, results):
"""
Perform mp4 decode operations.
return:
List where each item is a numpy array after decoder.
"""
file_path = results['filename']
results['format'] = 'video'
results['backend'] = self.backend
if self.backend == 'cv2': # here
cap = cv2.VideoCapture(file_path)
videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
sampledFrames = []
for i in range(videolen):
ret, frame = cap.read()
# maybe first frame is empty
if ret == False:
continue
img = frame[:, :, ::-1]
sampledFrames.append(img)
results['frames'] = sampledFrames
results['frames_len'] = len(sampledFrames)
elif self.backend == 'decord':
container = de.VideoReader(file_path)
frames_len = len(container)
results['frames'] = container
results['frames_len'] = frames_len
else:
raise NotImplementedError
return results
class Normalization(object):
"""
Normalization.
Args:
mean(Sequence[float]): mean values of different channels.
std(Sequence[float]): std values of different channels.
tensor_shape(list): size of mean, default [3,1,1]. For slowfast, [1,1,1,3]
"""
def __init__(self, mean, std, tensor_shape=[3, 1, 1], inplace=False):
if not isinstance(mean, Sequence):
raise TypeError(
'Mean must be list, tuple or np.ndarray, but got {type(mean)}')
if not isinstance(std, Sequence):
raise TypeError(
'Std must be list, tuple or np.ndarray, but got {type(std)}')
self.inplace = inplace
if not inplace:
self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32)
self.std = np.array(std).reshape(tensor_shape).astype(np.float32)
else:
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
def __call__(self, results):
"""
Performs normalization operations.
Args:
imgs: Numpy array.
return:
np_imgs: Numpy array after normalization.
"""
if self.inplace: # default is False
n = len(results['imgs'])
h, w, c = results['imgs'][0].shape
norm_imgs = np.empty((n, h, w, c), dtype=np.float32)
for i, img in enumerate(results['imgs']):
norm_imgs[i] = img
for img in norm_imgs: # [n,h,w,c]
mean = np.float64(self.mean.reshape(1, -1)) # [1, 3]
stdinv = 1 / np.float64(self.std.reshape(1, -1)) # [1, 3]
cv2.subtract(img, mean, img)
cv2.multiply(img, stdinv, img)
else:
imgs = results['imgs']
norm_imgs = imgs / 255.0
norm_imgs -= self.mean
norm_imgs /= self.std
if 'backend' in results and results['backend'] == 'pyav':
norm_imgs = paddle.to_tensor(norm_imgs, dtype=paddle.float32)
results['imgs'] = norm_imgs
return results
...@@ -365,15 +365,35 @@ def visualize_attr(im, results, boxes=None): ...@@ -365,15 +365,35 @@ def visualize_attr(im, results, boxes=None):
return im return im
def visualize_action(im, mot_boxes, action_visual_collector, action_text=""): def visualize_action(im,
mot_boxes,
action_visual_collector=None,
action_text="",
video_action_score=None,
video_action_text=""):
im = cv2.imread(im) if isinstance(im, str) else im im = cv2.imread(im) if isinstance(im, str) else im
id_detected = action_visual_collector.get_visualize_ids() im_h, im_w = im.shape[:2]
text_scale = max(1, im.shape[1] / 1600.) text_scale = max(1, im.shape[1] / 1600.)
text_thickness = 2
if action_visual_collector:
id_detected = action_visual_collector.get_visualize_ids()
for mot_box in mot_boxes: for mot_box in mot_boxes:
# mot_box is a format with [mot_id, class, score, xmin, ymin, w, h] # mot_box is a format with [mot_id, class, score, xmin, ymin, w, h]
if mot_box[0] in id_detected: if mot_box[0] in id_detected:
text_position = (int(mot_box[3] + mot_box[5] * 0.75), text_position = (int(mot_box[3] + mot_box[5] * 0.75),
int(mot_box[4] - 10)) int(mot_box[4] - 10))
cv2.putText(im, action_text, text_position, cv2.FONT_HERSHEY_PLAIN, cv2.putText(im, action_text, text_position,
text_scale, (0, 0, 255), 2) cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), 2)
if video_action_score:
cv2.putText(
im,
video_action_text + ': %.2f' % video_action_score,
(int(im_w / 2), int(15 * text_scale) + 5),
cv2.FONT_ITALIC,
text_scale, (0, 0, 255),
thickness=text_thickness)
return im return im
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册