update YOLO series paddle trt infer (#1400)

954bae8b · Guanghua Yu · GitHub · a77e2d68 · 954bae8b
隐藏空白更改
内联并排

Showing 1 changed file with 248 additions and 138 deletions

example/auto_compression/pytorch_yolo_series/paddle_trt_infer.py .../auto_compression/pytorch_yolo_series/paddle_trt_infer.py +248 -138

未找到文件。
--- a/example/auto_compression/pytorch_yolo_series/paddle_trt_infer.py
+++ b/example/auto_compression/pytorch_yolo_series/paddle_trt_infer.py
@@ -16,12 +16,70 @@ import os
 import cv2
 import numpy as np
 import argparse
+from tqdm import tqdm
+import pkg_resources as pkg
 import time
+import paddle
 from paddle.inference import Config
 from paddle.inference import create_predictor
+from dataset import COCOValDataset
+from post_process import YOLOPostProcess, coco_metric
+def _str2bool(value):
+    """Parse a command-line boolean such as ``True``, ``false``, ``1`` or ``0``.
+    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
+    non-empty value, including ``"False"``, is parsed as ``True``. This
+    converter interprets the text instead.
+    Args:
+        value (str|bool): raw option value.
+    Returns:
+        bool: the parsed flag.
+    Raises:
+        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean.
+    """
+    if isinstance(value, bool):
+        return value
+    lowered = str(value).strip().lower()
+    if lowered in ('true', 't', 'yes', 'y', '1'):
+        return True
+    if lowered in ('false', 'f', 'no', 'n', '0'):
+        return False
+    raise argparse.ArgumentTypeError(
+        'Boolean value expected, got {!r}.'.format(value))
+def argsparser():
+    """Build the command-line parser for this script.
+    When ``--image_file`` is given a single image is processed; otherwise the
+    COCO validation options (``--dataset_dir``, ``--val_image_dir``,
+    ``--val_anno_path``, ``--batch_size``) are used.
+    Returns:
+        argparse.ArgumentParser: parser with all options registered; the
+            caller is expected to invoke ``parse_args()`` on it.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        '--model_path', type=str, help="inference model filepath")
+    parser.add_argument(
+        '--image_file',
+        type=str,
+        default=None,
+        help="image path, if set image_file, it will not eval coco.")
+    parser.add_argument(
+        '--dataset_dir',
+        type=str,
+        default='dataset/coco',
+        help="COCO dataset dir.")
+    parser.add_argument(
+        '--val_image_dir',
+        type=str,
+        default='val2017',
+        help="COCO dataset val image dir.")
+    parser.add_argument(
+        '--val_anno_path',
+        type=str,
+        default='annotations/instances_val2017.json',
+        help="COCO dataset anno path.")
+    # Boolean options go through _str2bool so that "--benchmark False" and
+    # "--use_dynamic_shape False" really disable the feature.
+    parser.add_argument(
+        '--benchmark',
+        type=_str2bool,
+        default=False,
+        help="Whether run benchmark or not.")
+    parser.add_argument(
+        '--use_dynamic_shape',
+        type=_str2bool,
+        default=True,
+        help="Whether use dynamic shape or not.")
+    parser.add_argument(
+        '--run_mode',
+        type=str,
+        default='paddle',
+        help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)")
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU"
+    )
+    parser.add_argument(
+        '--arch', type=str, default='YOLOv5', help="architectures name.")
+    parser.add_argument('--img_shape', type=int, default=640, help="input_size")
+    parser.add_argument(
+        '--batch_size', type=int, default=1, help="Batch size of model input.")
+    return parser
-from post_process import YOLOPostProcess
 CLASS_LABEL = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
@@ -40,56 +98,28 @@ CLASS_LABEL = [
 ]
-def generate_scale(im, target_shape, keep_ratio=True):
+def preprocess(image, input_size, mean=None, std=None, swap=(2, 0, 1)):
-    """
+    if len(image.shape) == 3:
-        Args:
+        padded_img = np.ones((input_size[0], input_size[1], 3)) * 114.0
-            im (np.ndarray): image (np.ndarray)
-        Returns:
-            im_scale_x: the resize ratio of X
-            im_scale_y: the resize ratio of Y
-        """
-    origin_shape = im.shape[:2]
-    if keep_ratio:
-        im_size_min = np.min(origin_shape)
-        im_size_max = np.max(origin_shape)
-        target_size_min = np.min(target_shape)
-        target_size_max = np.max(target_shape)
-        im_scale = float(target_size_min) / float(im_size_min)
-        if np.round(im_scale * im_size_max) > target_size_max:
-            im_scale = float(target_size_max) / float(im_size_max)
-        im_scale_x = im_scale
-        im_scale_y = im_scale
    else:
-        resize_h, resize_w = target_shape
+        padded_img = np.ones(input_size) * 114.0
-        im_scale_y = resize_h / float(origin_shape[0])
+    img = np.array(image)
-        im_scale_x = resize_w / float(origin_shape[1])
+    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
-    return im_scale_y, im_scale_x
+    resized_img = cv2.resize(
-def image_preprocess(img_path, target_shape):
-    img = cv2.imread(img_path)
-    # Resize
-    im_scale_y, im_scale_x = generate_scale(img, target_shape)
-    img = cv2.resize(
        img,
-        None,
+        (int(img.shape[1] * r), int(img.shape[0] * r)),
-        None,
+        interpolation=cv2.INTER_LINEAR, ).astype(np.float32)
-        fx=im_scale_x,
+    padded_img[:int(img.shape[0] * r), :int(img.shape[1] * r)] = resized_img
-        fy=im_scale_y,
-        interpolation=cv2.INTER_LINEAR)
+    padded_img = padded_img[:, :, ::-1]
-    # Pad
+    padded_img /= 255.0
-    im_h, im_w = img.shape[:2]
+    if mean is not None:
-    h, w = target_shape[:]
+        padded_img -= mean
-    if h != im_h or w != im_w:
+    if std is not None:
-        canvas = np.ones((h, w, 3), dtype=np.float32)
+        padded_img /= std
-        canvas *= np.array([114.0, 114.0, 114.0], dtype=np.float32)
+    padded_img = padded_img.transpose(swap)
-        canvas[0:im_h, 0:im_w, :] = img.astype(np.float32)
+    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
-        img = canvas
+    return padded_img, r
-    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-    img = np.transpose(img, [2, 0, 1]) / 255
-    img = np.expand_dims(img, 0)
-    scale_factor = np.array([[im_scale_y, im_scale_x]])
-    return img.astype(np.float32), scale_factor
 def get_color_map_list(num_classes):
@@ -107,30 +137,77 @@ def get_color_map_list(num_classes):
    return color_map
-def draw_box(image_file, results, class_label, threshold=0.5):
+def draw_box(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
-    srcimg = cv2.imread(image_file, 1)
+    color_list = get_color_map_list(len(class_names))
-    for i in range(len(results)):
+    for i in range(len(boxes)):
-        color_list = get_color_map_list(len(class_label))
+        box = boxes[i]
-        clsid2color = {}
+        cls_id = int(cls_ids[i])
-        classid, conf = int(results[i, 0]), results[i, 1]
+        color = tuple(color_list[cls_id])
-        if conf < threshold:
+        score = scores[i]
+        if score < conf:
            continue
-        xmin, ymin, xmax, ymax = int(results[i, 2]), int(results[i, 3]), int(
+        x0 = int(box[0])
-            results[i, 4]), int(results[i, 5])
+        y0 = int(box[1])
+        x1 = int(box[2])
+        y1 = int(box[3])
-        if classid not in clsid2color:
+        text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100)
-            clsid2color[classid] = color_list[classid]
+        font = cv2.FONT_HERSHEY_SIMPLEX
-        color = tuple(clsid2color[classid])
-        cv2.rectangle(srcimg, (xmin, ymin), (xmax, ymax), color, thickness=2)
+        txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
-        print(class_label[classid] + ': ' + str(round(conf, 3)))
+        cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
+        cv2.rectangle(img, (x0, y0 + 1),
+                      (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])),
+                      color, -1)
        cv2.putText(
-            srcimg,
+            img,
-            class_label[classid] + ':' + str(round(conf, 3)), (xmin, ymin - 10),
+            text, (x0, y0 + txt_size[1]),
-            cv2.FONT_HERSHEY_SIMPLEX,
+            font,
            0.8, (0, 255, 0),
            thickness=2)
-    return srcimg
+    return img
+def get_current_memory_mb():
+    """Return the current CPU and GPU memory usage of this run, in MB.
+    ``pynvml``, ``psutil`` and ``GPUtil`` are installed with pip on first use
+    when they are missing. Querying them is comparatively slow, so this
+    function is itself time-consuming.
+    Returns:
+        tuple(float, float): ``(cpu_mem, gpu_mem)``, each rounded to 4
+            decimals. ``cpu_mem`` is the USS of the current process as
+            reported by psutil; ``gpu_mem`` is the used memory NVML reports
+            for the selected GPU, or 0 when no usable GPU is found.
+    """
+    import importlib
+    import subprocess
+    import sys
+    installed_any = False
+    for package in ('pynvml', 'psutil', 'GPUtil'):
+        try:
+            pkg.require(package)
+        except pkg.ResolutionError:
+            # Run pip through the interpreter; pip._internal is not a
+            # supported programmatic API.
+            subprocess.check_call(
+                [sys.executable, '-m', 'pip', 'install', package])
+            installed_any = True
+    if installed_any:
+        # Make the freshly installed packages importable in this process.
+        importlib.invalidate_caches()
+    import pynvml
+    import psutil
+    import GPUtil
+    process = psutil.Process(os.getpid())
+    cpu_mem = process.memory_full_info().uss / 1024. / 1024.
+    # CUDA_VISIBLE_DEVICES may be a list ("0,1"), empty, or hold UUIDs. Use
+    # the first entry when it is a plain index, otherwise report no GPU.
+    visible = str(os.environ.get('CUDA_VISIBLE_DEVICES', 0))
+    try:
+        gpu_id = int(visible.split(',')[0])
+    except ValueError:
+        gpu_id = None
+    gpu_mem = 0
+    if gpu_id is not None and 0 <= gpu_id < len(GPUtil.getGPUs()):
+        pynvml.nvmlInit()
+        try:
+            # Query the selected device rather than always device 0.
+            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
+            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
+            gpu_mem = meminfo.used / 1024. / 1024.
+        finally:
+            # Pair every nvmlInit() with a shutdown so repeated calls do not
+            # accumulate NVML initialisations.
+            pynvml.nvmlShutdown()
+    return round(cpu_mem, 4), round(gpu_mem, 4)
 def load_predictor(model_dir,
@@ -145,8 +222,7 @@ def load_predictor(model_dir,
                   trt_calib_mode=False,
                   cpu_threads=1,
                   enable_mkldnn=False,
-                   enable_mkldnn_bfloat16=False,
+                   enable_mkldnn_bfloat16=False):
-                   delete_shuffle_pass=False):
    """set AnalysisConfig, generate AnalysisPredictor
    Args:
        model_dir (str): root path of __model__ and __params__
@@ -158,8 +234,6 @@ def load_predictor(model_dir,
        trt_opt_shape (int): opt shape for dynamic shape in trt
        trt_calib_mode (bool): If the model is produced by TRT offline quantitative
            calibration, trt_calib_mode need to set True
-        delete_shuffle_pass (bool): whether to remove shuffle_channel_detect_pass in TensorRT. 
-                                    Used by action model.
    Returns:
        predictor (PaddlePredictor): AnalysisPredictor
    Raises:
@@ -212,7 +286,7 @@ def load_predictor(model_dir,
            use_calib_mode=trt_calib_mode)
        if use_dynamic_shape:
-            dynamic_shape_file = os.path.join(args.model_path,
+            dynamic_shape_file = os.path.join(FLAGS.model_path,
                                              'dynamic_shape.txt')
            if os.path.exists(dynamic_shape_file):
                config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file,
@@ -223,31 +297,69 @@ def load_predictor(model_dir,
                print('Start collect dynamic shape...')
                rerun_flag = True
-    # disable print log when predict
-    config.disable_glog_info()
    # enable shared memory
    config.enable_memory_optim()
    # disable feed, fetch OP, needed by zero_copy_run
    config.switch_use_feed_fetch_ops(False)
-    if delete_shuffle_pass:
-        config.delete_pass("shuffle_channel_detect_pass")
    predictor = create_predictor(config)
    return predictor, rerun_flag
-def predict_image(predictor,
+def eval(predictor, val_loader, anno_file, rerun_flag=False):
-                  image_file,
+    bboxes_list, bbox_nums_list, image_id_list = [], [], []
-                  image_shape=[640, 640],
+    cpu_mems, gpu_mems = 0, 0
-                  warmup=1,
+    sample_nums = len(val_loader)
-                  repeats=1,
+    with tqdm(
-                  threshold=0.5,
+            total=sample_nums,
-                  arch='YOLOv5'):
+            bar_format='Evaluation stage, Run batch:|{bar}| {n_fmt}/{total_fmt}',
-    img, scale_factor = image_preprocess(image_file, image_shape)
+            ncols=80) as t:
+        for data in val_loader:
+            data_all = {k: np.array(v) for k, v in data.items()}
+            inputs = {}
+            if FLAGS.arch == 'YOLOv6':
+                inputs['x2paddle_image_arrays'] = data_all['image']
+            else:
+                inputs['x2paddle_images'] = data_all['image']
+            input_names = predictor.get_input_names()
+            for i in range(len(input_names)):
+                input_tensor = predictor.get_input_handle(input_names[i])
+                input_tensor.copy_from_cpu(inputs[input_names[i]])
+            predictor.run()
+            output_names = predictor.get_output_names()
+            boxes_tensor = predictor.get_output_handle(output_names[0])
+            outs = boxes_tensor.copy_to_cpu()
+            if rerun_flag:
+                return
+            postprocess = YOLOPostProcess(
+                score_threshold=0.001, nms_threshold=0.65, multi_label=True)
+            res = postprocess(np.array(outs), data_all['scale_factor'])
+            bboxes_list.append(res['bbox'])
+            bbox_nums_list.append(res['bbox_num'])
+            image_id_list.append(np.array(data_all['im_id']))
+            cpu_mem, gpu_mem = get_current_memory_mb()
+            cpu_mems += cpu_mem
+            gpu_mems += gpu_mem
+            t.update()
+    print('Avg cpu_mem:{} MB, avg gpu_mem: {} MB'.format(
+        cpu_mems / sample_nums, gpu_mems / sample_nums))
+    coco_metric(anno_file, bboxes_list, bbox_nums_list, image_id_list)
+def infer(predictor):
+    warmup, repeats = 1, 1
+    if FLAGS.benchmark:
+        warmup, repeats = 50, 100
+    origin_img = cv2.imread(FLAGS.image_file)
+    input_image, scale_factor = preprocess(origin_img,
+                                           [FLAGS.img_shape, FLAGS.img_shape])
+    input_image = np.expand_dims(input_image, axis=0)
+    scale_factor = np.array([[scale_factor, scale_factor]])
    inputs = {}
-    if arch == 'YOLOv6':
+    if FLAGS.arch == 'YOLOv6':
-        inputs['x2paddle_image_arrays'] = img
+        inputs['x2paddle_image_arrays'] = input_image
    else:
-        inputs['x2paddle_images'] = img
+        inputs['x2paddle_images'] = input_image
    input_names = predictor.get_input_names()
    for i in range(len(input_names)):
        input_tensor = predictor.get_input_handle(input_names[i])
@@ -260,6 +372,7 @@ def predict_image(predictor,
    predict_time = 0.
    time_min = float("inf")
    time_max = float('-inf')
+    cpu_mems, gpu_mems = 0, 0
    for i in range(repeats):
        start_time = time.time()
        predictor.run()
@@ -271,6 +384,11 @@ def predict_image(predictor,
        time_min = min(time_min, timed)
        time_max = max(time_max, timed)
        predict_time += timed
+        cpu_mem, gpu_mem = get_current_memory_mb()
+        cpu_mems += cpu_mem
+        gpu_mems += gpu_mem
+    print('Avg cpu_mem:{} MB, avg gpu_mem: {} MB'.format(cpu_mems / repeats,
+                                                         gpu_mems / repeats))
    time_avg = predict_time / repeats
    print('Inference time(ms): min={}, max={}, avg={}'.format(
@@ -279,62 +397,54 @@ def predict_image(predictor,
    postprocess = YOLOPostProcess(
        score_threshold=0.001, nms_threshold=0.65, multi_label=True)
    res = postprocess(np_boxes, scale_factor)
-    res_img = draw_box(
+    # Draw rectangles and labels on the original image
-        image_file, res['bbox'], CLASS_LABEL, threshold=threshold)
+    dets = res['bbox']
-    cv2.imwrite('result.jpg', res_img)
+    if dets is not None:
+        final_boxes, final_scores, final_class = dets[:, 2:], dets[:,
+                                                                   1], dets[:,
+                                                                            0]
+        res_img = draw_box(
+            origin_img,
+            final_boxes,
+            final_scores,
+            final_class,
+            conf=0.5,
+            class_names=CLASS_LABEL)
+        cv2.imwrite('output.jpg', res_img)
+        print('The prediction results are saved in output.jpg.')
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--image_file', type=str, default=None, help="image path")
-    parser.add_argument(
-        '--model_path', type=str, help="inference model filepath")
-    parser.add_argument(
-        '--benchmark',
-        type=bool,
-        default=False,
-        help="Whether run benchmark or not.")
-    parser.add_argument(
-        '--use_dynamic_shape',
-        type=bool,
-        default=True,
-        help="Whether use dynamic shape or not.")
-    parser.add_argument(
-        '--run_mode',
-        type=str,
-        default='paddle',
-        help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)")
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='GPU',
-        help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU"
-    )
-    parser.add_argument(
-        '--arch', type=str, default='YOLOv5', help="architectures name.")
-    parser.add_argument('--img_shape', type=int, default=640, help="input_size")
-    args = parser.parse_args()
-    warmup, repeats = 1, 1
-    if args.benchmark:
-        warmup, repeats = 50, 100
+def main():
    predictor, rerun_flag = load_predictor(
-        args.model_path,
+        FLAGS.model_path,
-        run_mode=args.run_mode,
+        run_mode=FLAGS.run_mode,
-        device=args.device,
+        device=FLAGS.device,
-        use_dynamic_shape=args.use_dynamic_shape)
+        use_dynamic_shape=FLAGS.use_dynamic_shape)
-    predict_image(
-        predictor,
+    if FLAGS.image_file:
-        args.image_file,
+        infer(predictor)
-        image_shape=[args.img_shape, args.img_shape],
+    else:
-        warmup=warmup,
+        dataset = COCOValDataset(
-        repeats=repeats,
+            dataset_dir=FLAGS.dataset_dir,
-        arch=args.arch)
+            image_dir=FLAGS.val_image_dir,
+            anno_path=FLAGS.val_anno_path)
+        anno_file = dataset.ann_file
+        val_loader = paddle.io.DataLoader(
+            dataset, batch_size=FLAGS.batch_size, drop_last=True)
+        eval(predictor, val_loader, anno_file, rerun_flag=rerun_flag)
    if rerun_flag:
        print(
            "***** Collect dynamic shape done, Please rerun the program to get correct results. *****"
        )
+if __name__ == '__main__':
+    paddle.enable_static()
+    parser = argsparser()
+    # FLAGS is read as a module-level global by load_predictor, eval, infer
+    # and main, so it must be assigned before main() runs.
+    FLAGS = parser.parse_args()
+    # The DataLoader needs to run on the CPU.
+    paddle.set_device('cpu')
+    main()