fix detection infer (#1751)

64ebffc4 · xiaoluomi · GitHub · da3ef32e · 64ebffc4 · 64ebffc4
2 changed files
--- a/example/auto_compression/detection/configs/rtdetr_reader.yml
+++ b/example/auto_compression/detection/configs/rtdetr_reader.yml
@@ -12,6 +12,18 @@ TrainDataset:
    anno_path: annotations/instances_val2017.json
    dataset_dir: dataset/coco/
+EvalDataset:
+  !COCODataSet
+    image_dir: val2017
+    anno_path: annotations/instances_val2017.json
+    dataset_dir: dataset/coco/
+TestDataset:
+  !COCODataSet
+    image_dir: val2017
+    anno_path: annotations/instances_val2017.json
+    dataset_dir: dataset/coco/
 worker_num: 0
 # preprocess reader in test

--- a/example/auto_compression/detection/paddle_inference_eval.py
+++ b/example/auto_compression/detection/paddle_inference_eval.py
@@ -64,7 +64,8 @@ def argsparser():
        "--device",
        type=str,
        default="GPU",
-        help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU",
+        help=
+        "Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU",
    )
    parser.add_argument(
        "--use_dynamic_shape",
@@ -270,8 +271,8 @@ def load_predictor(
            dynamic_shape_file = os.path.join(FLAGS.model_path,
                                              "dynamic_shape.txt")
            if os.path.exists(dynamic_shape_file):
-                config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file,
+                config.enable_tuned_tensorrt_dynamic_shape(
-                                                           True)
+                    dynamic_shape_file, True)
                print("trt set dynamic shape done!")
            else:
                config.collect_shape_range_info(dynamic_shape_file)
@@ -284,48 +285,6 @@ def load_predictor(
    return predictor, rerun_flag
-def get_current_memory_mb():
-    """
-    It is used to Obtain the memory usage of the CPU and GPU during the running of the program.
-    And this function Current program is time-consuming.
-    """
-    try:
-        pkg.require('pynvml')
-    except:
-        from pip._internal import main
-        main(['install', 'pynvml'])
-    try:
-        pkg.require('psutil')
-    except:
-        from pip._internal import main
-        main(['install', 'psutil'])
-    try:
-        pkg.require('GPUtil')
-    except:
-        from pip._internal import main
-        main(['install', 'GPUtil'])
-    import pynvml
-    import psutil
-    import GPUtil
-    gpu_id = int(os.environ.get("CUDA_VISIBLE_DEVICES", 0))
-    pid = os.getpid()
-    p = psutil.Process(pid)
-    info = p.memory_full_info()
-    cpu_mem = info.uss / 1024.0 / 1024.0
-    gpu_mem = 0
-    gpu_percent = 0
-    gpus = GPUtil.getGPUs()
-    if gpu_id is not None and len(gpus) > 0:
-        gpu_percent = gpus[gpu_id].load
-        pynvml.nvmlInit()
-        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
-        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
-        gpu_mem = meminfo.used / 1024.0 / 1024.0
-    return round(cpu_mem, 4), round(gpu_mem, 4)
 def predict_image(predictor,
                  image_file,
                  image_shape=[640, 640],
@@ -353,6 +312,7 @@ def predict_image(predictor,
    predict_time = 0.0
    time_min = float("inf")
    time_max = float("-inf")
+    paddle.device.cuda.synchronize()
    for i in range(repeats):
        start_time = time.time()
        predictor.run()
@@ -367,13 +327,8 @@ def predict_image(predictor,
        time_min = min(time_min, timed)
        time_max = max(time_max, timed)
        predict_time += timed
-        cpu_mem, gpu_mem = get_current_memory_mb()
-        cpu_mems += cpu_mem
-        gpu_mems += gpu_mem
    time_avg = predict_time / repeats
-    print("[Benchmark]Avg cpu_mem:{} MB, avg gpu_mem: {} MB".format(
-        cpu_mems / repeats, gpu_mems / repeats))
    print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format(
        round(time_min * 1000, 2),
        round(time_max * 1000, 1), round(time_avg * 1000, 1)))
@@ -406,6 +361,7 @@ def eval(predictor, val_loader, metric, rerun_flag=False):
        for i, _ in enumerate(input_names):
            input_tensor = predictor.get_input_handle(input_names[i])
            input_tensor.copy_from_cpu(data_all[input_names[i]])
+        paddle.device.cuda.synchronize()
        start_time = time.time()
        predictor.run()
        np_boxes = boxes_tensor.copy_to_cpu()
@@ -418,9 +374,6 @@ def eval(predictor, val_loader, metric, rerun_flag=False):
        time_min = min(time_min, timed)
        time_max = max(time_max, timed)
        predict_time += timed
-        cpu_mem, gpu_mem = get_current_memory_mb()
-        cpu_mems += cpu_mem
-        gpu_mems += gpu_mem
        if not FLAGS.include_nms:
            postprocess = PPYOLOEPostProcess(
                score_threshold=0.3, nms_threshold=0.6)
@@ -436,8 +389,6 @@ def eval(predictor, val_loader, metric, rerun_flag=False):
    map_res = metric.get_results()
    metric.reset()
    time_avg = predict_time / sample_nums
-    print("[Benchmark]Avg cpu_mem:{} MB, avg gpu_mem: {} MB".format(
-        cpu_mems / sample_nums, gpu_mems / sample_nums))
    print("[Benchmark]Inference time(ms): min={}, max={}, avg={}".format(
        round(time_min * 1000, 2),
        round(time_max * 1000, 1), round(time_avg * 1000, 1)))
@@ -473,7 +424,8 @@ def main():
        dataset = reader_cfg["EvalDataset"]
        global val_loader
-        val_loader = create("EvalReader")(reader_cfg["EvalDataset"],
+        val_loader = create("EvalReader")(
+            reader_cfg["EvalDataset"],
            reader_cfg["worker_num"],
            return_list=True)
        clsid2catid = {v: k for k, v in dataset.catid2clsid.items()}