[cherry-pick]fix batch_size when trt infer (#3104)

* fix batch_size when trt infer

[cherry-pick]fix batch_size when trt infer (#3104)
* fix batch_size when trt infer
01cb2ee8 · Guanghua Yu · GitHub · ab6d3c53 · 01cb2ee8 · 01cb2ee8
7 changed files
--- a/deploy/cpp/docs/Jetson_build.md
+++ b/deploy/cpp/docs/Jetson_build.md
@@ -162,6 +162,7 @@ CUDNN_LIB=/usr/lib/aarch64-linux-gnu/
 | --use_gpu  | 是否使用 GPU 预测, 支持值为0或1(默认值为0)|
 | --gpu_id  |  指定进行推理的GPU device id(默认值为0)|
 | --run_mode | 使用GPU时，默认为fluid, 可选（fluid/trt_fp32/trt_fp16/trt_int8）|
+| --batch_size |预测时的batch size，在指定`image_dir`时有效 |
 | --run_benchmark | 是否重复预测来进行benchmark测速 |
 | --output_dir | 输出图片所在的文件夹, 默认为output |
 | --use_mkldnn | CPU预测中是否开启MKLDNN加速 |

--- a/deploy/cpp/docs/linux_build.md
+++ b/deploy/cpp/docs/linux_build.md
@@ -104,6 +104,7 @@ make
 | --use_gpu  | 是否使用 GPU 预测, 支持值为0或1(默认值为0)|
 | --gpu_id  |  指定进行推理的GPU device id(默认值为0)|
 | --run_mode | 使用GPU时，默认为fluid, 可选（fluid/trt_fp32/trt_fp16/trt_int8）|
+| --batch_size  | 预测时的batch size，在指定`image_dir`时有效 |
 | --run_benchmark | 是否重复预测来进行benchmark测速 |
 | --output_dir | 输出图片所在的文件夹, 默认为output |
 | --use_mkldnn | CPU预测中是否开启MKLDNN加速 |

--- a/deploy/cpp/docs/windows_vs2019_build.md
+++ b/deploy/cpp/docs/windows_vs2019_build.md
@@ -54,7 +54,7 @@ cd D:\projects\PaddleDetection\deploy\cpp
 2. 使用CMake生成项目文件
-编译参数的含义说明如下（带*表示仅在使用**GPU版本**预测库时指定, 其中CUDA库版本尽量对齐，**使用9.0、10.0版本，不使用9.2、10.1等版本CUDA库**）：
+编译参数的含义说明如下（带`*`表示仅在使用**GPU版本**预测库时指定, 其中CUDA库版本尽量对齐，**使用9.0、10.0版本，不使用9.2、10.1等版本CUDA库**）：
 |  参数名   | 含义  |
 |  ----  | ----  |
@@ -99,6 +99,7 @@ cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release
 | --use_gpu  | 是否使用 GPU 预测, 支持值为0或1(默认值为0)|
 | --gpu_id  |  指定进行推理的GPU device id(默认值为0)|
 | --run_mode | 使用GPU时，默认为fluid, 可选（fluid/trt_fp32/trt_fp16/trt_int8）|
+| --batch_size  | 预测时的batch size，在指定`image_dir`时有效 |
 | --run_benchmark | 是否重复预测来进行benchmark测速 |
 | --output_dir | 输出图片所在的文件夹, 默认为output |
 | --use_mkldnn | CPU预测中是否开启MKLDNN加速 |

--- a/deploy/cpp/include/object_detector.h
+++ b/deploy/cpp/include/object_detector.h
@@ -62,6 +62,7 @@ class ObjectDetector {
                          bool use_mkldnn=false,
                          int cpu_threads=1,
                          const std::string& run_mode="fluid",
+                          const int batch_size=1,
                          const int gpu_id=0,
                          bool use_dynamic_shape=false,
                          const int trt_min_shape=1,
@@ -83,7 +84,7 @@ class ObjectDetector {
    threshold_ = config_.draw_threshold_;
    image_shape_ = config_.image_shape_;
    preprocessor_.Init(config_.preprocess_info_, image_shape_);
-    LoadModel(model_dir, 1, run_mode);
+    LoadModel(model_dir, batch_size, run_mode);
  }
  // Load Paddle inference model

--- a/deploy/cpp/src/main.cc
+++ b/deploy/cpp/src/main.cc
@@ -72,7 +72,7 @@ void PrintBenchmarkLog(std::vector<double> det_time, int img_num){
  LOG(INFO) << "enable_mkldnn: " << (FLAGS_use_mkldnn ? "True" : "False");
  LOG(INFO) << "cpu_math_library_num_threads: " << FLAGS_cpu_threads;
  LOG(INFO) << "----------------------- Data info -----------------------";
-  LOG(INFO) << "batch_size: " << 1;
+  LOG(INFO) << "batch_size: " << FLAGS_batch_size;
  LOG(INFO) << "input_shape: " << "dynamic shape";
  LOG(INFO) << "----------------------- Model info -----------------------";
  FLAGS_model_dir.erase(FLAGS_model_dir.find_last_not_of("/") + 1);
@@ -332,7 +332,7 @@ void PredictImage(const std::vector<std::string> all_img_paths,
        if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) {
          output_path += OS_PATH_SEP;
        }
-        std::string image_file_path = all_img_paths.at(idx * batch_size+bs);
+        std::string image_file_path = all_img_paths.at(idx * batch_size + bs);
        output_path += image_file_path.substr(image_file_path.find_last_of('/') + 1);
        cv::imwrite(output_path, vis_img, compression_params);
        printf("Visualized output saved as %s\n", output_path.c_str());
@@ -361,7 +361,7 @@ int main(int argc, char** argv) {
  }
  // Load model and create a object detector
  PaddleDetection::ObjectDetector det(FLAGS_model_dir, FLAGS_use_gpu, FLAGS_use_mkldnn,
-                        FLAGS_cpu_threads, FLAGS_run_mode, FLAGS_gpu_id, FLAGS_use_dynamic_shape,
+                        FLAGS_cpu_threads, FLAGS_run_mode, FLAGS_batch_size,FLAGS_gpu_id, FLAGS_use_dynamic_shape,
                        FLAGS_trt_min_shape, FLAGS_trt_max_shape, FLAGS_trt_opt_shape, FLAGS_trt_calib_mode);
  // Do inference on input video or image
  if (!FLAGS_video_file.empty() || FLAGS_camera_id != -1) {

--- a/deploy/python/README.md
+++ b/deploy/python/README.md
@@ -35,6 +35,7 @@ python deploy/python/infer.py --model_dir=./inference/yolov3_mobilenet_v1_roadsi
 | --camera_id | Option | 用来预测的摄像头ID，默认为-1(表示不使用摄像头预测，可设置为：0 - (摄像头数目-1) )，预测过程中在可视化界面按`q`退出输出预测结果到：output/output.mp4|
 | --use_gpu | No |是否GPU，默认为False|
 | --run_mode | No |使用GPU时，默认为fluid, 可选（fluid/trt_fp32/trt_fp16/trt_int8）|
+| --batch_size | No |预测时的batch size，在指定`image_dir`时有效 |
 | --threshold | No|预测得分的阈值，默认为0.5|
 | --output_dir | No|可视化结果保存的根目录，默认为output/|
 | --run_benchmark | No| 是否运行benchmark，同时需指定`--image_file`或`--image_dir` |

--- a/deploy/python/infer.py
+++ b/deploy/python/infer.py
@@ -50,6 +50,7 @@ class Detector(object):
        model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml
        use_gpu (bool): whether use gpu
        run_mode (str): mode of running(fluid/trt_fp32/trt_fp16)
+        batch_size (int): size of each batch in inference
        use_dynamic_shape (bool): use dynamic shape or not
        trt_min_shape (int): min shape for dynamic shape in trt
        trt_max_shape (int): max shape for dynamic shape in trt
@@ -63,6 +64,7 @@ class Detector(object):
                 model_dir,
                 use_gpu=False,
                 run_mode='fluid',
+                 batch_size=1,
                 use_dynamic_shape=False,
                 trt_min_shape=1,
                 trt_max_shape=1280,
@@ -74,6 +76,7 @@ class Detector(object):
        self.predictor, self.config = load_predictor(
            model_dir,
            run_mode=run_mode,
+            batch_size=batch_size,
            min_subgraph_size=self.pred_config.min_subgraph_size,
            use_gpu=use_gpu,
            use_dynamic_shape=use_dynamic_shape,
@@ -186,6 +189,7 @@ class DetectorSOLOv2(Detector):
        model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml
        use_gpu (bool): whether use gpu
        run_mode (str): mode of running(fluid/trt_fp32/trt_fp16)
+        batch_size (int): size of each batch in inference
        use_dynamic_shape (bool): use dynamic shape or not
        trt_min_shape (int): min shape for dynamic shape in trt
        trt_max_shape (int): max shape for dynamic shape in trt
@@ -198,6 +202,7 @@ class DetectorSOLOv2(Detector):
                 model_dir,
                 use_gpu=False,
                 run_mode='fluid',
+                 batch_size=1,
                 use_dynamic_shape=False,
                 trt_min_shape=1,
                 trt_max_shape=1280,
@@ -209,6 +214,7 @@ class DetectorSOLOv2(Detector):
        self.predictor, self.config = load_predictor(
            model_dir,
            run_mode=run_mode,
+            batch_size=batch_size,
            min_subgraph_size=self.pred_config.min_subgraph_size,
            use_gpu=use_gpu,
            use_dynamic_shape=use_dynamic_shape,
@@ -568,6 +574,7 @@ def main():
        FLAGS.model_dir,
        use_gpu=FLAGS.use_gpu,
        run_mode=FLAGS.run_mode,
+        batch_size=FLAGS.batch_size,
        use_dynamic_shape=FLAGS.use_dynamic_shape,
        trt_min_shape=FLAGS.trt_min_shape,
        trt_max_shape=FLAGS.trt_max_shape,
@@ -581,6 +588,7 @@ def main():
            FLAGS.model_dir,
            use_gpu=FLAGS.use_gpu,
            run_mode=FLAGS.run_mode,
+            batch_size=FLAGS.batch_size,
            use_dynamic_shape=FLAGS.use_dynamic_shape,
            trt_min_shape=FLAGS.trt_min_shape,
            trt_max_shape=FLAGS.trt_max_shape,
@@ -615,7 +623,7 @@ def main():
                'precision': mode.split('_')[-1]
            }
            data_info = {
-                'batch_size': 1,
+                'batch_size': FLAGS.batch_size,
                'shape': "dynamic_shape",
                'data_num': perf_info['img_num']
            }