Unverified commit 5f9b0bc3 authored by wangguanzhong, committed by GitHub

refine keypoint deploy (#3473)

* refine keypoint deploy

* fit video infer

* fix post process

* update comments for keypoint_batch_size
Parent a1ed98ad
@@ -171,6 +171,9 @@ class Detector(object):
         self.det_times.img_num += len(image_list)
         return results

+    def get_timer(self):
+        return self.det_times
+
 class DetectorSOLOv2(Detector):
     """
@@ -269,8 +272,8 @@ class DetectorSOLOv2(Detector):
 def create_inputs(imgs, im_info):
     """generate input for different model type
     Args:
-        im (np.ndarray): image (np.ndarray)
-        im_info (dict): info of image
+        imgs (list(numpy)): list of images (np.ndarray)
+        im_info (list(dict)): list of image info
     Returns:
         inputs (dict): input of model
     """
...
@@ -15,6 +15,7 @@
 import os
 from PIL import Image
 import cv2
+import math
 import numpy as np
 import paddle
@@ -23,70 +24,94 @@ from preprocess import decode_image
 from infer import Detector, PredictConfig, print_arguments, get_test_images
 from keypoint_infer import KeyPoint_Detector, PredictConfig_KeyPoint
 from keypoint_visualize import draw_pose
+from benchmark_utils import PaddleInferBenchmark
+from utils import get_current_memory_mb
-def expand_crop(images, rect, expand_ratio=0.3):
-    imgh, imgw, c = images.shape
-    label, conf, xmin, ymin, xmax, ymax = [int(x) for x in rect.tolist()]
-    if label != 0:
-        return None, None, None
-    org_rect = [xmin, ymin, xmax, ymax]
-    h_half = (ymax - ymin) * (1 + expand_ratio) / 2.
-    w_half = (xmax - xmin) * (1 + expand_ratio) / 2.
-    if h_half > w_half * 4 / 3:
-        w_half = h_half * 0.75
-    center = [(ymin + ymax) / 2., (xmin + xmax) / 2.]
-    ymin = max(0, int(center[0] - h_half))
-    ymax = min(imgh - 1, int(center[0] + h_half))
-    xmin = max(0, int(center[1] - w_half))
-    xmax = min(imgw - 1, int(center[1] + w_half))
-    return images[ymin:ymax, xmin:xmax, :], [xmin, ymin, xmax, ymax], org_rect
-
-def get_person_from_rect(images, results):
-    det_results = results['boxes']
-    mask = det_results[:, 1] > FLAGS.det_threshold
-    valid_rects = det_results[mask]
-    image_buff = []
-    org_rects = []
-    for rect in valid_rects:
-        rect_image, new_rect, org_rect = expand_crop(images, rect)
-        if rect_image is None or rect_image.size == 0:
-            continue
-        image_buff.append([rect_image, new_rect])
-        org_rects.append(org_rect)
-    return image_buff, org_rects
+def bench_log(detector, img_list, model_info, batch_size=1, name=None):
+    mems = {
+        'cpu_rss_mb': detector.cpu_mem / len(img_list),
+        'gpu_rss_mb': detector.gpu_mem / len(img_list),
+        'gpu_util': detector.gpu_util * 100 / len(img_list)
+    }
+    perf_info = detector.det_times.report(average=True)
+    data_info = {
+        'batch_size': batch_size,
+        'shape': "dynamic_shape",
+        'data_num': perf_info['img_num']
+    }
+    log = PaddleInferBenchmark(detector.config, model_info, data_info,
+                               perf_info, mems)
+    log(name)
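Note: `bench_log` reports per-image averages because the predict loops below add one memory sample per processed image. A minimal sketch of that bookkeeping, using a hypothetical stand-in detector (a real run passes the Paddle predictor config through to `PaddleInferBenchmark`):

```python
# Hypothetical stand-in: cpu_mem/gpu_mem/gpu_util accumulate one sample
# per processed image, so dividing by len(img_list) gives the average.
class FakeDetector:
    cpu_mem, gpu_mem, gpu_util = 0.0, 0.0, 0.0

det = FakeDetector()
for cm, gm, gu in [(512.0, 900.0, 0.35), (520.0, 910.0, 0.40)]:
    det.cpu_mem += cm
    det.gpu_mem += gm
    det.gpu_util += gu

n_imgs = 2
print(det.cpu_mem / n_imgs)         # -> 516.0 (cpu_rss_mb)
print(det.gpu_mem / n_imgs)         # -> 905.0 (gpu_rss_mb)
print(det.gpu_util * 100 / n_imgs)  # -> 37.5 (gpu_util)
```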
 def affine_backto_orgimages(keypoint_result, batch_records):
     kpts, scores = keypoint_result['keypoint']
-    kpts[..., 0] += batch_records[0]
-    kpts[..., 1] += batch_records[1]
+    kpts[..., 0] += batch_records[:, 0:1]
+    kpts[..., 1] += batch_records[:, 1:2]
     return kpts, scores
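The old code indexed `batch_records` as one flat rect; with batching it becomes an `[N, 4]` array, and the `[:, 0:1]` slice keeps a trailing axis so NumPy broadcasts each crop's top-left corner across that crop's joints. A quick sketch, with shapes assumed from the surrounding code:

```python
import numpy as np

kpts = np.zeros((2, 17, 3))  # [num_crops, num_joints, (x, y, score)]
records = np.array([[10., 20., 110., 220.],
                    [50., 60., 150., 260.]])  # [num_crops, 4] crop rects

kpts[..., 0] += records[:, 0:1]  # [2, 1] broadcasts over the 17 joints
kpts[..., 1] += records[:, 1:2]
print(kpts[0, 0, :2], kpts[1, 0, :2])  # -> [10. 20.] [50. 60.]
```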
-def topdown_unite_predict(detector, topdown_keypoint_detector, image_list):
+def topdown_unite_predict(detector,
+                          topdown_keypoint_detector,
+                          image_list,
+                          keypoint_batch_size=1):
+    det_timer = detector.get_timer()
     for i, img_file in enumerate(image_list):
+        # Decode image in advance in det + pose prediction
+        det_timer.preprocess_time_s.start()
         image, _ = decode_image(img_file, {})
-        results = detector.predict([image], FLAGS.det_threshold)
+        det_timer.preprocess_time_s.end()
+        if FLAGS.run_benchmark:
+            results = detector.predict(
+                [image], FLAGS.det_threshold, warmup=10, repeats=10)
+            cm, gm, gu = get_current_memory_mb()
+            detector.cpu_mem += cm
+            detector.gpu_mem += gm
+            detector.gpu_util += gu
+        else:
+            results = detector.predict([image], FLAGS.det_threshold)
         if results['boxes_num'] == 0:
             continue
-        batchs_images, det_rects = get_person_from_rect(image, results)
+        rec_images, records, det_rects = topdown_keypoint_detector.get_person_from_rect(
+            image, results, FLAGS.det_threshold)
         keypoint_vector = []
         score_vector = []
-        rect_vecotr = det_rects
-        for batch_images, batch_records in batchs_images:
-            keypoint_result = topdown_keypoint_detector.predict(
-                batch_images, FLAGS.keypoint_threshold)
+        rect_vector = det_rects
+        batch_loop_cnt = math.ceil(float(len(rec_images)) / keypoint_batch_size)
+        for i in range(batch_loop_cnt):
+            start_index = i * keypoint_batch_size
+            end_index = min((i + 1) * keypoint_batch_size, len(rec_images))
+            batch_images = rec_images[start_index:end_index]
+            batch_records = np.array(records[start_index:end_index])
+            if FLAGS.run_benchmark:
+                keypoint_result = topdown_keypoint_detector.predict(
+                    batch_images,
+                    FLAGS.keypoint_threshold,
+                    warmup=10,
+                    repeats=10)
+            else:
+                keypoint_result = topdown_keypoint_detector.predict(
+                    batch_images, FLAGS.keypoint_threshold)
             orgkeypoints, scores = affine_backto_orgimages(keypoint_result,
                                                            batch_records)
             keypoint_vector.append(orgkeypoints)
             score_vector.append(scores)
-        keypoint_res = {}
-        keypoint_res['keypoint'] = [
-            np.vstack(keypoint_vector), np.vstack(score_vector)
-        ]
-        keypoint_res['bbox'] = rect_vecotr
-        if not os.path.exists(FLAGS.output_dir):
-            os.makedirs(FLAGS.output_dir)
-        draw_pose(
+        if FLAGS.run_benchmark:
+            cm, gm, gu = get_current_memory_mb()
+            topdown_keypoint_detector.cpu_mem += cm
+            topdown_keypoint_detector.gpu_mem += gm
+            topdown_keypoint_detector.gpu_util += gu
+        else:
+            keypoint_res = {}
+            keypoint_res['keypoint'] = [
+                np.vstack(keypoint_vector), np.vstack(score_vector)
+            ]
+            keypoint_res['bbox'] = rect_vector
+            if not os.path.exists(FLAGS.output_dir):
+                os.makedirs(FLAGS.output_dir)
+            draw_pose(
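The batching arithmetic above is plain ceil-division over the person crops; a standalone sketch with five crops and `keypoint_batch_size=2` (the last batch is simply shorter):

```python
import math

rec_images = ['crop0', 'crop1', 'crop2', 'crop3', 'crop4']
keypoint_batch_size = 2
batch_loop_cnt = math.ceil(float(len(rec_images)) / keypoint_batch_size)
for i in range(batch_loop_cnt):
    start_index = i * keypoint_batch_size
    end_index = min((i + 1) * keypoint_batch_size, len(rec_images))
    print(rec_images[start_index:end_index])
# -> ['crop0', 'crop1']
# -> ['crop2', 'crop3']
# -> ['crop4']
```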
@@ -96,7 +121,10 @@ def topdown_unite_predict(detector, topdown_keypoint_detector, image_list):
             save_dir=FLAGS.output_dir)

-def topdown_unite_predict_video(detector, topdown_keypoint_detector, camera_id):
+def topdown_unite_predict_video(detector,
+                                topdown_keypoint_detector,
+                                camera_id,
+                                keypoint_batch_size=1):
     if camera_id != -1:
         capture = cv2.VideoCapture(camera_id)
         video_name = 'output.mp4'
@@ -124,10 +152,16 @@ def topdown_unite_predict_video(detector, topdown_keypoint_detector, camera_id):
         frame2 = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
         results = detector.predict([frame2], FLAGS.det_threshold)
-        batchs_images, rect_vecotr = get_person_from_rect(frame2, results)
+        rec_images, records, rect_vector = topdown_keypoint_detector.get_person_from_rect(
+            frame2, results)
         keypoint_vector = []
         score_vector = []
-        for batch_images, batch_records in batchs_images:
+        batch_loop_cnt = math.ceil(float(len(rec_images)) / keypoint_batch_size)
+        for i in range(batch_loop_cnt):
+            start_index = i * keypoint_batch_size
+            end_index = min((i + 1) * keypoint_batch_size, len(rec_images))
+            batch_images = rec_images[start_index:end_index]
+            batch_records = np.array(records[start_index:end_index])
             keypoint_result = topdown_keypoint_detector.predict(
                 batch_images, FLAGS.keypoint_threshold)
             orgkeypoints, scores = affine_backto_orgimages(keypoint_result,
@@ -138,7 +172,7 @@ def topdown_unite_predict_video(detector, topdown_keypoint_detector, camera_id):
         keypoint_res['keypoint'] = [
             np.vstack(keypoint_vector), np.vstack(score_vector)
         ] if len(keypoint_vector) > 0 else [[], []]
-        keypoint_res['bbox'] = rect_vecotr
+        keypoint_res['bbox'] = rect_vector
         im = draw_pose(
             frame,
             keypoint_res,
@@ -184,11 +218,30 @@ def main():
     # predict from video file or camera video stream
     if FLAGS.video_file is not None or FLAGS.camera_id != -1:
         topdown_unite_predict_video(detector, topdown_keypoint_detector,
-                                    FLAGS.camera_id)
+                                    FLAGS.camera_id, FLAGS.keypoint_batch_size)
     else:
         # predict from image
         img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
-        topdown_unite_predict(detector, topdown_keypoint_detector, img_list)
+        topdown_unite_predict(detector, topdown_keypoint_detector, img_list,
+                              FLAGS.keypoint_batch_size)
+        if not FLAGS.run_benchmark:
+            detector.det_times.info(average=True)
+            topdown_keypoint_detector.det_times.info(average=True)
+        else:
+            mode = FLAGS.run_mode
+            det_model_dir = FLAGS.det_model_dir
+            det_model_info = {
+                'model_name': det_model_dir.strip('/').split('/')[-1],
+                'precision': mode.split('_')[-1]
+            }
+            bench_log(detector, img_list, det_model_info, name='Det')
+            keypoint_model_dir = FLAGS.keypoint_model_dir
+            keypoint_model_info = {
+                'model_name': keypoint_model_dir.strip('/').split('/')[-1],
+                'precision': mode.split('_')[-1]
+            }
+            bench_log(topdown_keypoint_detector, img_list, keypoint_model_info,
+                      FLAGS.keypoint_batch_size, 'KeyPoint')

 if __name__ == '__main__':
...
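The benchmark labels above are derived from directory names, so trailing slashes are stripped first; a quick check with hypothetical paths (the `trt_fp16` run mode is an assumed example value):

```python
# Hypothetical values, just to show the string handling in main().
det_model_dir = 'output_inference/my_det_model/'
print(det_model_dir.strip('/').split('/')[-1])  # -> 'my_det_model'

mode = 'trt_fp16'  # assumed run_mode value
print(mode.split('_')[-1])  # -> 'fp16' (reported as precision)
```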
@@ -20,10 +20,11 @@ from functools import reduce
 from PIL import Image
 import cv2
+import math
 import numpy as np
 import paddle
 from preprocess import preprocess, NormalizeImage, Permute
-from keypoint_preprocess import EvalAffine, TopDownEvalAffine
+from keypoint_preprocess import EvalAffine, TopDownEvalAffine, expand_crop
 from keypoint_postprocess import HrHRNetPostProcess, HRNetPostProcess
 from keypoint_visualize import draw_pose
 from paddle.inference import Config
@@ -82,14 +83,41 @@ class KeyPoint_Detector(object):
         self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0
         self.use_dark = use_dark

-    def preprocess(self, im):
+    def get_person_from_rect(self, image, results, det_threshold=0.5):
+        # crop the person result from image
+        self.det_times.preprocess_time_s.start()
+        det_results = results['boxes']
+        mask = det_results[:, 1] > det_threshold
+        valid_rects = det_results[mask]
+        rect_images = []
+        new_rects = []
+        org_rects = []
+        for rect in valid_rects:
+            rect_image, new_rect, org_rect = expand_crop(image, rect)
+            if rect_image is None or rect_image.size == 0:
+                continue
+            rect_images.append(rect_image)
+            new_rects.append(new_rect)
+            org_rects.append(org_rect)
+        self.det_times.preprocess_time_s.end()
+        return rect_images, new_rects, org_rects
+
+    def preprocess(self, image_list):
         preprocess_ops = []
         for op_info in self.pred_config.preprocess_infos:
             new_op_info = op_info.copy()
             op_type = new_op_info.pop('type')
             preprocess_ops.append(eval(op_type)(**new_op_info))
-        im, im_info = preprocess(im, preprocess_ops)
-        inputs = create_inputs(im, im_info)
+        input_im_lst = []
+        input_im_info_lst = []
+        for im in image_list:
+            im, im_info = preprocess(im, preprocess_ops)
+            input_im_lst.append(im)
+            input_im_info_lst.append(im_info)
+        inputs = create_inputs(input_im_lst, input_im_info_lst)
         return inputs
     def postprocess(self, np_boxes, np_masks, inputs, threshold=0.5):
@@ -118,10 +146,10 @@ class KeyPoint_Detector(object):
             raise ValueError("Unsupported arch: {}, expect {}".format(
                 self.pred_config.arch, KEYPOINT_SUPPORT_MODELS))

-    def predict(self, image, threshold=0.5, warmup=0, repeats=1):
+    def predict(self, image_list, threshold=0.5, warmup=0, repeats=1):
         '''
         Args:
-            image (str/np.ndarray): path of image/ np.ndarray read by cv2
+            image_list (list): list of images
             threshold (float): threshold of predicted box' score
         Returns:
             results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box,
@@ -130,7 +158,7 @@ class KeyPoint_Detector(object):
                    shape: [N, im_h, im_w]
         '''
         self.det_times.preprocess_time_s.start()
-        inputs = self.preprocess(image)
+        inputs = self.preprocess(image_list)
         np_boxes, np_masks = None, None
         input_names = self.predictor.get_input_names()
@@ -172,23 +200,24 @@ class KeyPoint_Detector(object):
         results = self.postprocess(
             np_boxes, np_masks, inputs, threshold=threshold)
         self.det_times.postprocess_time_s.end()
-        self.det_times.img_num += 1
+        self.det_times.img_num += len(image_list)
         return results

-def create_inputs(im, im_info):
+def create_inputs(imgs, im_info):
     """generate input for different model type
     Args:
-        im (np.ndarray): image (np.ndarray)
-        im_info (dict): info of image
-        model_arch (str): model type
+        imgs (list(numpy)): list of images (np.ndarray)
+        im_info (list(dict)): list of image info
     Returns:
         inputs (dict): input of model
     """
     inputs = {}
-    inputs['image'] = np.array((im, )).astype('float32')
-    inputs['im_shape'] = np.array((im_info['im_shape'], )).astype('float32')
+    inputs['image'] = np.stack(imgs, axis=0)
+    im_shape = []
+    for e in im_info:
+        im_shape.append(np.array((e['im_shape'])).astype('float32'))
+    inputs['im_shape'] = np.stack(im_shape, axis=0)
     return inputs
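`np.stack` only works here because every person crop has been warped to the same fixed input size by `TopDownEvalAffine` before batching; a sketch assuming a 256x192 top-down input:

```python
import numpy as np

# Each preprocessed crop is CHW at a fixed eval size (assumed 3x256x192),
# so stacking along a new axis 0 yields an NCHW batch.
imgs = [np.zeros((3, 256, 192), dtype='float32') for _ in range(4)]
batch = np.stack(imgs, axis=0)
print(batch.shape)  # -> (4, 3, 256, 192)
```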
@@ -326,14 +355,14 @@ def load_predictor(model_dir,
 def predict_image(detector, image_list):
     for i, img_file in enumerate(image_list):
         if FLAGS.run_benchmark:
-            detector.predict(img_file, FLAGS.threshold, warmup=10, repeats=10)
+            detector.predict([img_file], FLAGS.threshold, warmup=10, repeats=10)
             cm, gm, gu = get_current_memory_mb()
             detector.cpu_mem += cm
             detector.gpu_mem += gm
             detector.gpu_util += gu
             print('Test iter {}, file name:{}'.format(i, img_file))
         else:
-            results = detector.predict(img_file, FLAGS.threshold)
+            results = detector.predict([img_file], FLAGS.threshold)
             if not os.path.exists(FLAGS.output_dir):
                 os.makedirs(FLAGS.output_dir)
             draw_pose(
...
@@ -176,3 +176,21 @@ class TopDownEvalAffine(object):
             flags=cv2.INTER_LINEAR)
         return image, im_info

+def expand_crop(images, rect, expand_ratio=0.3):
+    imgh, imgw, c = images.shape
+    label, conf, xmin, ymin, xmax, ymax = [int(x) for x in rect.tolist()]
+    if label != 0:
+        return None, None, None
+    org_rect = [xmin, ymin, xmax, ymax]
+    h_half = (ymax - ymin) * (1 + expand_ratio) / 2.
+    w_half = (xmax - xmin) * (1 + expand_ratio) / 2.
+    if h_half > w_half * 4 / 3:
+        w_half = h_half * 0.75
+    center = [(ymin + ymax) / 2., (xmin + xmax) / 2.]
+    ymin = max(0, int(center[0] - h_half))
+    ymax = min(imgh - 1, int(center[0] + h_half))
+    xmin = max(0, int(center[1] - w_half))
+    xmax = min(imgw - 1, int(center[1] + w_half))
+    return images[ymin:ymax, xmin:xmax, :], [xmin, ymin, xmax, ymax], org_rect
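A worked example of the crop logic above: the box grows by `expand_ratio` on each side, and the `w_half = h_half * 0.75` branch widens tall boxes to a 3:4 width-to-height ratio (the aspect ratio top-down keypoint models expect), clipped to the image bounds. The input values here are made up:

```python
import numpy as np
# assumes expand_crop from keypoint_preprocess (defined above) is in scope

image = np.zeros((720, 1280, 3), dtype='uint8')
# [label, score, xmin, ymin, xmax, ymax]: a 40x160 person box
rect = np.array([0, 0.9, 100, 100, 140, 260])

crop, new_rect, org_rect = expand_crop(image, rect)
print(org_rect)        # -> [100, 100, 140, 260]
print(new_rect)        # -> [42, 76, 198, 284], widened to 3:4
print(crop.shape[:2])  # -> (208, 156), i.e. 156/208 == 0.75
```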
@@ -39,6 +39,13 @@ def argsparser():
         type=str,
         default=None,
         help="Dir of image file, `image_file` has a higher priority.")
+    parser.add_argument(
+        "--keypoint_batch_size",
+        type=int,
+        default=1,
+        help=("batch_size for keypoint inference. In detection-keypoint "
+              "unite inference, the batch size of detection is 1. Detected "
+              "boxes are then collated into batches for keypoint inference."))
     parser.add_argument(
         "--video_file",
         type=str,
...
@@ -35,7 +35,7 @@ def argsparser():
         default=None,
         help="Dir of image file, `image_file` has a higher priority.")
     parser.add_argument(
-        "--batch_size", type=int, default=1, help="batch_size for infer.")
+        "--batch_size", type=int, default=1, help="batch_size for inference.")
     parser.add_argument(
         "--video_file",
         type=str,
...
@@ -46,8 +46,7 @@ class BBoxPostProcess(nn.Layer):
         self.nms = nms
         self.fake_bboxes = paddle.to_tensor(
             np.array(
-                [[-1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
-                dtype='float32'))
+                [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32'))
         self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32'))

     def forward(self, head_out, rois, im_shape, scale_factor):
...
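The fake bbox shrinks from 10 columns to 6, matching the `[class_id, score, x0, y0, x1, y1]` layout the deploy code unpacks (see `expand_crop` above); the `-1` class id serves as a placeholder row when no real boxes are produced. A quick consistency check:

```python
import numpy as np

fake_bboxes = np.array([[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32')
label, conf, xmin, ymin, xmax, ymax = [int(x) for x in fake_bboxes[0].tolist()]
print(label)  # -> -1: not class 0 (person), so expand_crop skips it
```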