未验证 提交 954bae8b 编写于 作者: G Guanghua Yu 提交者: GitHub

update YOLO series paddle trt infer (#1400)

上级 a77e2d68
......@@ -16,12 +16,70 @@ import os
import cv2
import numpy as np
import argparse
from tqdm import tqdm
import pkg_resources as pkg
import time
import paddle
from paddle.inference import Config
from paddle.inference import create_predictor
from dataset import COCOValDataset
from post_process import YOLOPostProcess, coco_metric
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value (including ``"False"``) becomes ``True``. This converter
    interprets the usual spellings explicitly instead.

    Args:
        value (str|bool): raw token from the command line, or an already
            parsed boolean.

    Returns:
        bool: the parsed flag value.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if token in ('false', 'f', 'no', 'n', 'off', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'Boolean value expected, got {!r}.'.format(value))


def argsparser():
    """Build the command-line parser for the inference / evaluation script.

    Returns:
        argparse.ArgumentParser: parser exposing the model path, optional
            single-image path, COCO dataset locations, benchmark and
            dynamic-shape switches, run mode, device, architecture name,
            input size and batch size.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--model_path', type=str, help="inference model filepath")
    parser.add_argument(
        '--image_file',
        type=str,
        default=None,
        help="image path, if set image_file, it will not eval coco.")
    parser.add_argument(
        '--dataset_dir',
        type=str,
        default='dataset/coco',
        help="COCO dataset dir.")
    parser.add_argument(
        '--val_image_dir',
        type=str,
        default='val2017',
        help="COCO dataset val image dir.")
    parser.add_argument(
        '--val_anno_path',
        type=str,
        default='annotations/instances_val2017.json',
        help="COCO dataset anno path.")
    # Booleans go through _str2bool so that "--benchmark False" is really False.
    parser.add_argument(
        '--benchmark',
        type=_str2bool,
        default=False,
        help="Whether run benchmark or not.")
    parser.add_argument(
        '--use_dynamic_shape',
        type=_str2bool,
        default=True,
        help="Whether use dynamic shape or not.")
    parser.add_argument(
        '--run_mode',
        type=str,
        default='paddle',
        help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)")
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU"
    )
    parser.add_argument(
        '--arch', type=str, default='YOLOv5', help="architectures name.")
    parser.add_argument('--img_shape', type=int, default=640, help="input_size")
    parser.add_argument(
        '--batch_size', type=int, default=1, help="Batch size of model input.")
    return parser
from post_process import YOLOPostProcess
CLASS_LABEL = [
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
......@@ -40,56 +98,28 @@ CLASS_LABEL = [
]
def generate_scale(im, target_shape, keep_ratio=True):
"""
Args:
im (np.ndarray): image (np.ndarray)
Returns:
im_scale_x: the resize ratio of X
im_scale_y: the resize ratio of Y
"""
origin_shape = im.shape[:2]
if keep_ratio:
im_size_min = np.min(origin_shape)
im_size_max = np.max(origin_shape)
target_size_min = np.min(target_shape)
target_size_max = np.max(target_shape)
im_scale = float(target_size_min) / float(im_size_min)
if np.round(im_scale * im_size_max) > target_size_max:
im_scale = float(target_size_max) / float(im_size_max)
im_scale_x = im_scale
im_scale_y = im_scale
def preprocess(image, input_size, mean=None, std=None, swap=(2, 0, 1)):
if len(image.shape) == 3:
padded_img = np.ones((input_size[0], input_size[1], 3)) * 114.0
else:
resize_h, resize_w = target_shape
im_scale_y = resize_h / float(origin_shape[0])
im_scale_x = resize_w / float(origin_shape[1])
return im_scale_y, im_scale_x
def image_preprocess(img_path, target_shape):
img = cv2.imread(img_path)
# Resize
im_scale_y, im_scale_x = generate_scale(img, target_shape)
img = cv2.resize(
padded_img = np.ones(input_size) * 114.0
img = np.array(image)
r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
resized_img = cv2.resize(
img,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=cv2.INTER_LINEAR)
# Pad
im_h, im_w = img.shape[:2]
h, w = target_shape[:]
if h != im_h or w != im_w:
canvas = np.ones((h, w, 3), dtype=np.float32)
canvas *= np.array([114.0, 114.0, 114.0], dtype=np.float32)
canvas[0:im_h, 0:im_w, :] = img.astype(np.float32)
img = canvas
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = np.transpose(img, [2, 0, 1]) / 255
img = np.expand_dims(img, 0)
scale_factor = np.array([[im_scale_y, im_scale_x]])
return img.astype(np.float32), scale_factor
(int(img.shape[1] * r), int(img.shape[0] * r)),
interpolation=cv2.INTER_LINEAR, ).astype(np.float32)
padded_img[:int(img.shape[0] * r), :int(img.shape[1] * r)] = resized_img
padded_img = padded_img[:, :, ::-1]
padded_img /= 255.0
if mean is not None:
padded_img -= mean
if std is not None:
padded_img /= std
padded_img = padded_img.transpose(swap)
padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
return padded_img, r
def get_color_map_list(num_classes):
......@@ -107,30 +137,77 @@ def get_color_map_list(num_classes):
return color_map
def draw_box(image_file, results, class_label, threshold=0.5):
srcimg = cv2.imread(image_file, 1)
for i in range(len(results)):
color_list = get_color_map_list(len(class_label))
clsid2color = {}
classid, conf = int(results[i, 0]), results[i, 1]
if conf < threshold:
def draw_box(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
color_list = get_color_map_list(len(class_names))
for i in range(len(boxes)):
box = boxes[i]
cls_id = int(cls_ids[i])
color = tuple(color_list[cls_id])
score = scores[i]
if score < conf:
continue
xmin, ymin, xmax, ymax = int(results[i, 2]), int(results[i, 3]), int(
results[i, 4]), int(results[i, 5])
x0 = int(box[0])
y0 = int(box[1])
x1 = int(box[2])
y1 = int(box[3])
if classid not in clsid2color:
clsid2color[classid] = color_list[classid]
color = tuple(clsid2color[classid])
text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100)
font = cv2.FONT_HERSHEY_SIMPLEX
cv2.rectangle(srcimg, (xmin, ymin), (xmax, ymax), color, thickness=2)
print(class_label[classid] + ': ' + str(round(conf, 3)))
txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
cv2.rectangle(img, (x0, y0 + 1),
(x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])),
color, -1)
cv2.putText(
srcimg,
class_label[classid] + ':' + str(round(conf, 3)), (xmin, ymin - 10),
cv2.FONT_HERSHEY_SIMPLEX,
img,
text, (x0, y0 + txt_size[1]),
font,
0.8, (0, 255, 0),
thickness=2)
return srcimg
return img
def _ensure_installed(package):
    """Install ``package`` with pip when ``pkg.require`` cannot resolve it."""
    try:
        pkg.require(package)
    except pkg.ResolutionError:
        # Best-effort install kept from the original behaviour; only resolution
        # failures trigger it, so KeyboardInterrupt/SystemExit still propagate.
        from pip._internal import main as pip_main
        pip_main(['install', package])


def _first_visible_gpu_id():
    """Return the first integer index in CUDA_VISIBLE_DEVICES, or 0.

    The variable may be unset, empty, a comma-separated list ("0,1") or a
    non-numeric identifier; anything that is not a plain integer falls back
    to 0 instead of raising.
    """
    visible = str(os.environ.get('CUDA_VISIBLE_DEVICES', 0))
    first = visible.split(',')[0].strip()
    try:
        return int(first)
    except ValueError:
        return 0


def get_current_memory_mb():
    """Report the current CPU and GPU memory usage in MB.

    Missing helper packages (pynvml, psutil, GPUtil) are installed with pip
    on first use. Every call queries the OS and NVML, so calling it inside a
    timed loop adds overhead.

    Returns:
        tuple(float, float): ``(cpu_mem, gpu_mem)`` rounded to 4 decimals.
            ``cpu_mem`` is the USS of this process; ``gpu_mem`` is the used
            memory NVML reports for the selected device, or 0 when GPUtil
            lists no GPU or the selected index is out of range.
    """
    for package in ('pynvml', 'psutil', 'GPUtil'):
        _ensure_installed(package)
    import pynvml
    import psutil
    import GPUtil

    gpu_id = _first_visible_gpu_id()

    process = psutil.Process(os.getpid())
    cpu_mem = process.memory_full_info().uss / 1024. / 1024.

    gpu_mem = 0
    gpus = GPUtil.getGPUs()
    if 0 <= gpu_id < len(gpus):
        pynvml.nvmlInit()
        try:
            # Query the device selected above rather than always device 0.
            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            gpu_mem = meminfo.used / 1024. / 1024.
        finally:
            # Pair every nvmlInit with a shutdown so repeated calls do not
            # accumulate NVML initialisations.
            pynvml.nvmlShutdown()
    return round(cpu_mem, 4), round(gpu_mem, 4)
def load_predictor(model_dir,
......@@ -145,8 +222,7 @@ def load_predictor(model_dir,
trt_calib_mode=False,
cpu_threads=1,
enable_mkldnn=False,
enable_mkldnn_bfloat16=False,
delete_shuffle_pass=False):
enable_mkldnn_bfloat16=False):
"""set AnalysisConfig, generate AnalysisPredictor
Args:
model_dir (str): root path of __model__ and __params__
......@@ -158,8 +234,6 @@ def load_predictor(model_dir,
trt_opt_shape (int): opt shape for dynamic shape in trt
trt_calib_mode (bool): If the model is produced by TRT offline quantitative
calibration, trt_calib_mode need to set True
delete_shuffle_pass (bool): whether to remove shuffle_channel_detect_pass in TensorRT.
Used by action model.
Returns:
predictor (PaddlePredictor): AnalysisPredictor
Raises:
......@@ -212,7 +286,7 @@ def load_predictor(model_dir,
use_calib_mode=trt_calib_mode)
if use_dynamic_shape:
dynamic_shape_file = os.path.join(args.model_path,
dynamic_shape_file = os.path.join(FLAGS.model_path,
'dynamic_shape.txt')
if os.path.exists(dynamic_shape_file):
config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file,
......@@ -223,31 +297,69 @@ def load_predictor(model_dir,
print('Start collect dynamic shape...')
rerun_flag = True
# disable print log when predict
config.disable_glog_info()
# enable shared memory
config.enable_memory_optim()
# disable feed, fetch OP, needed by zero_copy_run
config.switch_use_feed_fetch_ops(False)
if delete_shuffle_pass:
config.delete_pass("shuffle_channel_detect_pass")
predictor = create_predictor(config)
return predictor, rerun_flag
def predict_image(predictor,
image_file,
image_shape=[640, 640],
warmup=1,
repeats=1,
threshold=0.5,
arch='YOLOv5'):
img, scale_factor = image_preprocess(image_file, image_shape)
def eval(predictor, val_loader, anno_file, rerun_flag=False):
    """Run the predictor over ``val_loader`` and compute COCO metrics.

    Args:
        predictor: inference predictor exposing input/output handles.
        val_loader: iterable of batches; each batch is a mapping that
            provides at least 'image', 'scale_factor' and 'im_id'.
        anno_file: annotation file forwarded to ``coco_metric``.
        rerun_flag (bool): when true, return right after the first batch has
            been run, skipping post-processing and metrics.

    Returns:
        None. Average memory usage is printed and the metric computation is
        delegated to ``coco_metric``.
    """
    all_bboxes, all_bbox_nums, all_image_ids = [], [], []
    total_cpu_mb, total_gpu_mb = 0, 0
    sample_nums = len(val_loader)
    progress_format = 'Evaluation stage, Run batch:|{bar}| {n_fmt}/{total_fmt}'
    with tqdm(total=sample_nums, bar_format=progress_format,
              ncols=80) as progress:
        for batch in val_loader:
            arrays = {key: np.array(value) for key, value in batch.items()}

            # The input tensor name depends on the architecture.
            if FLAGS.arch == 'YOLOv6':
                feeds = {'x2paddle_image_arrays': arrays['image']}
            else:
                feeds = {'x2paddle_images': arrays['image']}
            for name in predictor.get_input_names():
                handle = predictor.get_input_handle(name)
                handle.copy_from_cpu(feeds[name])

            predictor.run()
            first_output = predictor.get_output_names()[0]
            raw_boxes = predictor.get_output_handle(first_output).copy_to_cpu()

            # One forward pass is all that is needed when only re-running.
            if rerun_flag:
                return

            postprocess = YOLOPostProcess(
                score_threshold=0.001, nms_threshold=0.65, multi_label=True)
            result = postprocess(np.array(raw_boxes), arrays['scale_factor'])
            all_bboxes.append(result['bbox'])
            all_bbox_nums.append(result['bbox_num'])
            all_image_ids.append(np.array(arrays['im_id']))

            batch_cpu_mb, batch_gpu_mb = get_current_memory_mb()
            total_cpu_mb += batch_cpu_mb
            total_gpu_mb += batch_gpu_mb
            progress.update()

    avg_cpu_mb = total_cpu_mb / sample_nums
    avg_gpu_mb = total_gpu_mb / sample_nums
    print('Avg cpu_mem:{} MB, avg gpu_mem: {} MB'.format(avg_cpu_mb,
                                                         avg_gpu_mb))
    coco_metric(anno_file, all_bboxes, all_bbox_nums, all_image_ids)
def infer(predictor):
warmup, repeats = 1, 1
if FLAGS.benchmark:
warmup, repeats = 50, 100
origin_img = cv2.imread(FLAGS.image_file)
input_image, scale_factor = preprocess(origin_img,
[FLAGS.img_shape, FLAGS.img_shape])
input_image = np.expand_dims(input_image, axis=0)
scale_factor = np.array([[scale_factor, scale_factor]])
inputs = {}
if arch == 'YOLOv6':
inputs['x2paddle_image_arrays'] = img
if FLAGS.arch == 'YOLOv6':
inputs['x2paddle_image_arrays'] = input_image
else:
inputs['x2paddle_images'] = img
inputs['x2paddle_images'] = input_image
input_names = predictor.get_input_names()
for i in range(len(input_names)):
input_tensor = predictor.get_input_handle(input_names[i])
......@@ -260,6 +372,7 @@ def predict_image(predictor,
predict_time = 0.
time_min = float("inf")
time_max = float('-inf')
cpu_mems, gpu_mems = 0, 0
for i in range(repeats):
start_time = time.time()
predictor.run()
......@@ -271,6 +384,11 @@ def predict_image(predictor,
time_min = min(time_min, timed)
time_max = max(time_max, timed)
predict_time += timed
cpu_mem, gpu_mem = get_current_memory_mb()
cpu_mems += cpu_mem
gpu_mems += gpu_mem
print('Avg cpu_mem:{} MB, avg gpu_mem: {} MB'.format(cpu_mems / repeats,
gpu_mems / repeats))
time_avg = predict_time / repeats
print('Inference time(ms): min={}, max={}, avg={}'.format(
......@@ -279,62 +397,54 @@ def predict_image(predictor,
postprocess = YOLOPostProcess(
score_threshold=0.001, nms_threshold=0.65, multi_label=True)
res = postprocess(np_boxes, scale_factor)
res_img = draw_box(
image_file, res['bbox'], CLASS_LABEL, threshold=threshold)
cv2.imwrite('result.jpg', res_img)
# Draw rectangles and labels on the original image
dets = res['bbox']
if dets is not None:
final_boxes, final_scores, final_class = dets[:, 2:], dets[:,
1], dets[:,
0]
res_img = draw_box(
origin_img,
final_boxes,
final_scores,
final_class,
conf=0.5,
class_names=CLASS_LABEL)
cv2.imwrite('output.jpg', res_img)
print('The prediction results are saved in output.jpg.')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--image_file', type=str, default=None, help="image path")
parser.add_argument(
'--model_path', type=str, help="inference model filepath")
parser.add_argument(
'--benchmark',
type=bool,
default=False,
help="Whether run benchmark or not.")
parser.add_argument(
'--use_dynamic_shape',
type=bool,
default=True,
help="Whether use dynamic shape or not.")
parser.add_argument(
'--run_mode',
type=str,
default='paddle',
help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)")
parser.add_argument(
'--device',
type=str,
default='GPU',
help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU"
)
parser.add_argument(
'--arch', type=str, default='YOLOv5', help="architectures name.")
parser.add_argument('--img_shape', type=int, default=640, help="input_size")
args = parser.parse_args()
warmup, repeats = 1, 1
if args.benchmark:
warmup, repeats = 50, 100
def main():
predictor, rerun_flag = load_predictor(
args.model_path,
run_mode=args.run_mode,
device=args.device,
use_dynamic_shape=args.use_dynamic_shape)
predict_image(
predictor,
args.image_file,
image_shape=[args.img_shape, args.img_shape],
warmup=warmup,
repeats=repeats,
arch=args.arch)
FLAGS.model_path,
run_mode=FLAGS.run_mode,
device=FLAGS.device,
use_dynamic_shape=FLAGS.use_dynamic_shape)
if FLAGS.image_file:
infer(predictor)
else:
dataset = COCOValDataset(
dataset_dir=FLAGS.dataset_dir,
image_dir=FLAGS.val_image_dir,
anno_path=FLAGS.val_anno_path)
anno_file = dataset.ann_file
val_loader = paddle.io.DataLoader(
dataset, batch_size=FLAGS.batch_size, drop_last=True)
eval(predictor, val_loader, anno_file, rerun_flag=rerun_flag)
if rerun_flag:
print(
"***** Collect dynamic shape done, Please rerun the program to get correct results. *****"
)
if __name__ == '__main__':
    # Script entry point: switch Paddle to static-graph mode, then parse the
    # command line into the module-level FLAGS read by the other functions.
    paddle.enable_static()
    parser = argsparser()
    FLAGS = parser.parse_args()

    # The DataLoader needs to run on the CPU.
    paddle.set_device('cpu')

    main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册