Commit 2c6fa076 authored by Bin Long, committed by qingqing01

Support C++ TRT inference (#188)

* C++ inference: support TensorRT
* Update README.md
* add a yolov3 demo yaml
Parent 47741382
......@@ -92,7 +92,7 @@ if (WIN32)
add_definitions(-DSTATIC_LIB)
endif()
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -std=c++11")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fopenmp -std=c++11")
set(CMAKE_STATIC_LIBRARY_PREFIX "")
endif()
......@@ -195,8 +195,8 @@ endif(NOT WIN32)
if(WITH_GPU)
if(NOT WIN32)
if (USE_TENSORRT)
set(DEPS ${DEPS} ${PADDLE_DIR}/third_party/install/tensorrt/lib/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
set(DEPS ${DEPS} ${PADDLE_DIR}/third_party/install/tensorrt/lib/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
set(DEPS ${DEPS} ${PADDLE_DIR}/third_party/install/tensorrt/lib/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX})
set(DEPS ${DEPS} ${PADDLE_DIR}/third_party/install/tensorrt/lib/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()
set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
set(DEPS ${DEPS} ${CUDNN_LIB}/libcudnn${CMAKE_SHARED_LIBRARY_SUFFIX})
......
......@@ -123,7 +123,10 @@ DEPLOY:
RESIZE_MAX_SIZE: 1333
# Number of input tensors.
FEEDS_SIZE: 3
# Whether to enable TensorRT
USE_TRT: 0
# Precision to use when TensorRT is enabled; supported values are FP16, FP32, and INT8
TRT_MODE: FP16
```
Set the `MODEL_PATH` field to the directory where you placed the model files downloaded and extracted in the **previous step**. For details on the other configuration fields, see the document [Deployment configuration file reference](./docs/configuration.md).
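When `USE_TRT` is 1 and `PREDICTOR_MODE` is `ANALYSIS`, the engine maps `TRT_MODE` to a Paddle inference precision and enables the TensorRT subgraph engine before the predictor is created. The sketch below mirrors the calls used in this change; the helper name `make_trt_config` and the hard-coded `__model__`/`__params__` filenames are illustrative assumptions, not part of the deployment code.

```cpp
#include <string>
#include <paddle_inference_api.h>

// Minimal sketch: build an AnalysisConfig the way the predictor does
// when USE_TRT is 1 in the deployment YAML.
paddle::AnalysisConfig make_trt_config(const std::string& model_dir,
                                       const std::string& trt_mode,
                                       int batch_size) {
    paddle::AnalysisConfig config;
    config.EnableUseGpu(100, 0);  // initial GPU memory pool (MB), device id
    // Map the TRT_MODE string from the YAML file to a Paddle precision value.
    auto precision = paddle::AnalysisConfig::Precision::kFloat32;
    if (trt_mode == "FP16") {
        precision = paddle::AnalysisConfig::Precision::kHalf;
    } else if (trt_mode == "INT8") {
        precision = paddle::AnalysisConfig::Precision::kInt8;
    }
    // INT8 needs a calibration pass; FP16/FP32 do not.
    bool use_calib = (precision == paddle::AnalysisConfig::Precision::kInt8);
    config.EnableTensorRtEngine(1 << 20,     // TensorRT workspace size in bytes
                                batch_size,  // max batch size
                                40,          // min subgraph size to convert
                                precision, false, use_calib);
    config.SetModel(model_dir + "/__model__", model_dir + "/__params__");
    return config;
}

// Usage (illustrative):
// auto predictor = paddle::CreatePaddlePredictor(make_trt_config(
//     "/root/projects/models/yolov3_mobilenet_v1/", "FP16", 1));
```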
......
......@@ -13,6 +13,8 @@ DEPLOY:
CHANNELS : 3
PRE_PROCESSOR: "DetectionPreProcessor"
PREDICTOR_MODE: "ANALYSIS"
BATCH_SIZE : 1
RESIZE_MAX_SIZE: 1333
FEEDS_SIZE: 3
USE_TRT : 0
TRT_MODE : FP32
......@@ -17,3 +17,5 @@ DEPLOY:
RESIZE_MAX_SIZE: 1333
FEEDS_SIZE: 3
COARSEST_STRIDE: 32
USE_TRT : 0
TRT_MODE : FP32
DEPLOY:
USE_GPU: 1
MODEL_PATH: "/root/projects/models/yolov3_mobilenet_v1/"
MODEL_FILENAME: "__model__"
PARAMS_FILENAME: "__params__"
EVAL_CROP_SIZE: (320, 320)
RESIZE_TYPE: "UNPADDING"
TARGET_SHORT_SIZE : 256
MEAN: [0.4647, 0.4647, 0.4647]
STD: [0.0834, 0.0834, 0.0834]
IMAGE_TYPE: "rgb"
NUM_CLASSES: 1
CHANNELS : 3
PRE_PROCESSOR: "DetectionPreProcessor"
PREDICTOR_MODE: "ANALYSIS"
BATCH_SIZE : 3
RESIZE_MAX_SIZE: -1
FEEDS_SIZE: 2
USE_TRT : 1
TRT_MODE : "FP16"
......@@ -32,17 +32,18 @@ namespace PaddleSolution {
int max_h = -1;
int max_w = -1;
for (int i = 0; i < batch_size; ++i) {
max_h = (max_h > resize_heights[i])? max_h:resize_heights[i];
max_w = (max_w > resize_widths[i])? max_w:resize_widths[i];
max_h = (max_h > resize_heights[i])? max_h : resize_heights[i];
max_w = (max_w > resize_widths[i])? max_w : resize_widths[i];
}
max_h = static_cast<int>(ceil(static_cast<float>(max_h)
/ static_cast<float>(coarsest_stride)) * coarsest_stride);
/ static_cast<float>(coarsest_stride)) * coarsest_stride);
max_w = static_cast<int>(ceil(static_cast<float>(max_w)
/ static_cast<float>(coarsest_stride)) * coarsest_stride);
std::cout << "max_w: " << max_w << " max_h: " << max_h << std::endl;
/ static_cast<float>(coarsest_stride)) * coarsest_stride);
input_buffer.insert(input_buffer.end(),
batch_size * channels * max_h * max_w, 0);
// flatten tensor and padding
#pragma omp parallel for
for (int i = 0; i < lod_buffer.size(); ++i) {
float *input_buffer_ptr = input_buffer.data()
+ i * channels * max_h * max_w;
......@@ -121,6 +122,8 @@ namespace PaddleSolution {
}
bool use_gpu = _model_config._use_gpu;
bool enable_trt = _model_config._enable_trt && use_gpu;
auto trt_precision = _model_config._trt_precision;
const auto& model_dir = _model_config._model_path;
const auto& model_filename = _model_config._model_file_name;
const auto& params_filename = _model_config._param_file_name;
......@@ -136,11 +139,17 @@ namespace PaddleSolution {
config.use_gpu = use_gpu;
config.device = 0;
_main_predictor = paddle::CreatePaddlePredictor(config);
} else if (_model_config._predictor_mode == "ANALYSIS") {
paddle::AnalysisConfig config;
if (use_gpu) {
config.EnableUseGpu(100, 0);
}
if (enable_trt) {
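// EnableTensorRtEngine arguments: workspace size (1 << 20 bytes), max batch
// size, minimum TensorRT subgraph size (40), precision, use_static, use_calib_mode.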
auto use_calib = (trt_precision == paddle::AnalysisConfig::Precision::kInt8);
config.EnableTensorRtEngine(1 << 20, _model_config._batch_size, 40,
trt_precision, false, use_calib);
}
auto prog_file = utils::path_join(model_dir, model_filename);
auto param_file = utils::path_join(model_dir, params_filename);
config.SetModel(prog_file, param_file);
......@@ -288,7 +297,6 @@ namespace PaddleSolution {
}
feeds.push_back(im_size_tensor);
_outputs.clear();
auto t1 = std::chrono::high_resolution_clock::now();
if (!_main_predictor->Run(feeds, &_outputs, batch_size)) {
#ifdef _WIN32
......@@ -376,7 +384,6 @@ namespace PaddleSolution {
std::cout << "Failed to preprocess!" << std::endl;
return -1;
}
// flatten tensor
padding_minibatch(lod_buffer, input_buffer, resize_heights,
resize_widths, channels,
......@@ -423,7 +430,6 @@ namespace PaddleSolution {
im_size_tensor->Reshape({batch_size, 2});
im_size_tensor->copy_from_cpu(image_size.data());
}
auto t1 = std::chrono::high_resolution_clock::now();
_main_predictor->ZeroCopyRun();
auto t2 = std::chrono::high_resolution_clock::now();
......
......@@ -62,16 +62,22 @@ bool DetectionPreProcessor::single_process(const std::string& fname,
int rw = im.cols;
int rh = im.rows;
float im_scale_ratio;
utils::scaling(_config->_resize_type, rw, rh, _config->_resize[0],
_config->_resize[1], _config->_target_short_size,
_config->_resize_max_size, im_scale_ratio);
if (!_config->_enable_trt) {
utils::scaling(_config->_resize_type, rw, rh, _config->_resize[0],
_config->_resize[1], _config->_target_short_size,
_config->_resize_max_size, im_scale_ratio);
} else {
// TensorRT 5 only supports fixed-shape input
rw = _config->_resize[0];
rh = _config->_resize[1];
}
cv::Size resize_size(rw, rh);
*resize_w = rw;
*resize_h = rh;
*scale_ratio = im_scale_ratio;
if (*ori_h != rh || *ori_w != rw) {
cv::Mat im_temp;
if (_config->_resize_type == utils::SCALE_TYPE::UNPADDING) {
if (_config->_enable_trt || _config->_resize_type == utils::SCALE_TYPE::UNPADDING) {
cv::resize(im, im_temp, resize_size, 0, 0, cv::INTER_LINEAR);
} else if (_config->_resize_type == utils::SCALE_TYPE::RANGE_SCALING) {
cv::resize(im, im_temp, cv::Size(), im_scale_ratio,
......@@ -85,6 +91,7 @@ bool DetectionPreProcessor::single_process(const std::string& fname,
float* pmean = _config->_mean.data();
float* pscale = _config->_std.data();
#pragma omp parallel for
for (int h = 0; h < rh; ++h) {
const uchar* uptr = im.ptr<uchar>(h);
const float* fptr = im.ptr<float>(h);
......
......@@ -18,6 +18,7 @@
#include <vector>
#include <string>
#include <map>
#include <paddle_inference_api.h>
namespace PaddleSolution {
......@@ -30,6 +31,7 @@ class PaddleModelConfigPaser {
_channels(0),
_use_gpu(0),
_batch_size(1),
_enable_trt(false),
_target_short_size(0),
_model_file_name("__model__"),
_param_file_name("__params__"),
......@@ -58,6 +60,7 @@ class PaddleModelConfigPaser {
_resize_max_size = 0;
_feeds_size = 1;
_coarsest_stride = 1;
_enable_trt = false;
}
std::string process_parenthesis(const std::string& str) {
......@@ -214,6 +217,34 @@ class PaddleModelConfigPaser {
if (config["DEPLOY"]["COARSEST_STRIDE"].IsDefined()) {
_coarsest_stride = config["DEPLOY"]["COARSEST_STRIDE"].as<int>();
}
// 20. enable_trt
if (config["DEPLOY"]["USE_TRT"].IsDefined()) {
_enable_trt = config["DEPLOY"]["USE_TRT"].as<int>();
_enable_trt &= _use_gpu;
} else {
_enable_trt = false;
}
if (_enable_trt) {
std::string trt_mode = "";
if (config["DEPLOY"]["TRT_MODE"].IsDefined()) {
trt_mode = config["DEPLOY"]["TRT_MODE"].as<std::string>();
} else {
trt_mode = "FP32";
}
if (trt_mode == "FP16") {
_trt_precision = paddle::AnalysisConfig::Precision::kHalf;
} else if (trt_mode == "FP32") {
_trt_precision = paddle::AnalysisConfig::Precision::kFloat32;
} else if (trt_mode == "INT8") {
_trt_precision = paddle::AnalysisConfig::Precision::kInt8;
} else {
_enable_trt = false;
}
}
if (_predictor_mode == "NATIVE") {
_enable_trt = false;
}
return true;
}
......@@ -293,5 +324,9 @@ class PaddleModelConfigPaser {
std::string _predictor_mode;
// DEPLOY.BATCH_SIZE
int _batch_size;
// DEPLOY.USE_TRT
bool _enable_trt;
// DEPLOY.TRT_MODE (TensorRT precision)
paddle::AnalysisConfig::Precision _trt_precision;
};
} // namespace PaddleSolution