diff --git a/dygraph/deploy/cpp/include/object_detector.h b/dygraph/deploy/cpp/include/object_detector.h
index a92be7ffc99ce363aa9c1537e56b3c1ce1aab207..4c1846a249df12d549b1b3bf28e28d879377eaf6 100644
--- a/dygraph/deploy/cpp/include/object_detector.h
+++ b/dygraph/deploy/cpp/include/object_detector.h
@@ -59,12 +59,17 @@ class ObjectDetector {
   explicit ObjectDetector(const std::string& model_dir,
                           bool use_gpu=false,
                           const std::string& run_mode="fluid",
-                          const int gpu_id=0) {
+                          const int gpu_id=0,
+                          bool use_dynamic_shape=false,
+                          const int trt_min_shape=1,
+                          const int trt_max_shape=1280,
+                          const int trt_opt_shape=640) {
     config_.load_config(model_dir);
     threshold_ = config_.draw_threshold_;
     image_shape_ = config_.image_shape_;
     preprocessor_.Init(config_.preprocess_info_, image_shape_);
-    LoadModel(model_dir, use_gpu, config_.min_subgraph_size_, 1, run_mode, gpu_id);
+    LoadModel(model_dir, use_gpu, config_.min_subgraph_size_, 1, run_mode, gpu_id,
+              use_dynamic_shape, trt_min_shape, trt_max_shape, trt_opt_shape);
   }
 
   // Load Paddle inference model
@@ -74,7 +79,11 @@ class ObjectDetector {
       const int min_subgraph_size,
       const int batch_size = 1,
       const std::string& run_mode = "fluid",
-      const int gpu_id=0);
+      const int gpu_id=0,
+      bool use_dynamic_shape=false,
+      const int trt_min_shape=1,
+      const int trt_max_shape=1280,
+      const int trt_opt_shape=640);
 
   // Run predictor
   void Predict(const cv::Mat& im,
diff --git a/dygraph/deploy/cpp/include/preprocess_op.h b/dygraph/deploy/cpp/include/preprocess_op.h
index fb840c8689e466f9a477fbfdda71360e2124b4a9..e48639b553867dff859cef4816558df7654edcfb 100644
--- a/dygraph/deploy/cpp/include/preprocess_op.h
+++ b/dygraph/deploy/cpp/include/preprocess_op.h
@@ -37,8 +37,8 @@ class ImageBlob {
   std::vector<float> im_shape_;
   // Buffer for image data after preprocessing
   std::vector<float> im_data_;
-  // input image width, height
-  std::vector<float> input_shape_;
+  // in-net data shape (after padding)
+  std::vector<float> in_net_shape_;
   // Evaluation image width and height
   //std::vector<float> eval_im_size_f_;
   // Scale factor for image size to origin image size
@@ -90,7 +90,7 @@ class Resize : public PreprocessOp {
     keep_ratio_ = item["keep_ratio"].as<bool>();
     target_size_ = item["target_size"].as<std::vector<int>>();
     if (item["keep_ratio"]) {
-      input_shape_ = image_shape;
+      in_net_shape_ = image_shape;
     }
   }
 
@@ -103,7 +103,7 @@ class Resize : public PreprocessOp {
   int interp_;
   bool keep_ratio_;
   std::vector<int> target_size_;
-  std::vector<int> input_shape_;
+  std::vector<int> in_net_shape_;
 };
 
 // Models with FPN need input shape % stride == 0
diff --git a/dygraph/deploy/cpp/scripts/build.sh b/dygraph/deploy/cpp/scripts/build.sh
index be11362881d2e5cc726629c21502d28186be4d40..a32b1d383256f0d775dba41d16b99468558e9135
--- a/dygraph/deploy/cpp/scripts/build.sh
+++ b/dygraph/deploy/cpp/scripts/build.sh
@@ -35,11 +35,11 @@
 then
     echo "set OPENCV_DIR for x86_64"
     # On Linux, download the prebuilt OpenCV with the following commands
    mkdir -p $(pwd)/deps && cd $(pwd)/deps
-    wget -c https://bj.bcebos.com/paddleseg/deploy/opencv3.4.6gcc4.8ffmpeg.tar.gz2
-    tar xvfj opencv3.4.6gcc4.8ffmpeg.tar.gz2 && cd ..
+    wget -c https://paddledet.bj.bcebos.com/data/opencv3.4.6gcc8.2ffmpeg.zip
+    unzip opencv3.4.6gcc8.2ffmpeg.zip && cd ..
     # set OPENCV_DIR
-    OPENCV_DIR=$(pwd)/deps/opencv3.4.6gcc4.8ffmpeg/
+    OPENCV_DIR=$(pwd)/deps/opencv3.4.6gcc8.2ffmpeg
 
 elif [ "$MACHINE_TYPE" = "aarch64" ]
 then
diff --git a/dygraph/deploy/cpp/src/main.cc b/dygraph/deploy/cpp/src/main.cc
index 92ecb32061f6a12497af88c8c24532ef41508ea1..6fb49769288f12f7c71345f03f689d6a67757f19 100644
--- a/dygraph/deploy/cpp/src/main.cc
+++ b/dygraph/deploy/cpp/src/main.cc
@@ -42,6 +42,10 @@
 DEFINE_int32(camera_id, -1, "Device id of camera to predict");
 DEFINE_bool(run_benchmark, false, "Whether to predict a image_file repeatedly for benchmark");
 DEFINE_double(threshold, 0.5, "Threshold of score.");
 DEFINE_string(output_dir, "output", "Directory of output visualization files.");
+DEFINE_bool(use_dynamic_shape, false, "Whether TensorRT uses a dynamic input shape");
+DEFINE_int32(trt_min_shape, 1, "Min shape of TRT DynamicShapeInfo");
+DEFINE_int32(trt_max_shape, 1280, "Max shape of TRT DynamicShapeInfo");
+DEFINE_int32(trt_opt_shape, 640, "Opt shape of TRT DynamicShapeInfo");
 
 static std::string DirName(const std::string &filepath) {
   auto pos = filepath.rfind(OS_PATH_SEP);
@@ -198,14 +202,17 @@ int main(int argc, char** argv) {
     return -1;
   }
   if (!(FLAGS_run_mode == "fluid" || FLAGS_run_mode == "trt_fp32"
-      || FLAGS_run_mode == "trt_fp16")) {
-    std::cout << "run_mode should be 'fluid', 'trt_fp32' or 'trt_fp16'.";
+      || FLAGS_run_mode == "trt_fp16" || FLAGS_run_mode == "trt_int8")) {
+    std::cout << "run_mode should be 'fluid', 'trt_fp32', 'trt_fp16' or 'trt_int8'.";
     return -1;
   }
   // Load model and create an object detector
-
-  PaddleDetection::ObjectDetector det(FLAGS_model_dir, FLAGS_use_gpu,
-    FLAGS_run_mode, FLAGS_gpu_id);
+  const std::vector<int> trt_min_shape = {1, FLAGS_trt_min_shape, FLAGS_trt_min_shape};
+  const std::vector<int> trt_max_shape = {1, FLAGS_trt_max_shape, FLAGS_trt_max_shape};
+  const std::vector<int> trt_opt_shape = {1, FLAGS_trt_opt_shape, FLAGS_trt_opt_shape};
+  PaddleDetection::ObjectDetector det(FLAGS_model_dir, FLAGS_use_gpu, FLAGS_run_mode,
+                                      FLAGS_gpu_id, FLAGS_use_dynamic_shape, FLAGS_trt_min_shape,
+                                      FLAGS_trt_max_shape, FLAGS_trt_opt_shape);
   // Do inference on input video or image
   if (!FLAGS_video_path.empty() || FLAGS_use_camera) {
     PredictVideo(FLAGS_video_path, &det);
diff --git a/dygraph/deploy/cpp/src/object_detector.cc b/dygraph/deploy/cpp/src/object_detector.cc
index 5faa83a2eb3fb382fc98d83d687fb0082b5dfa8b..f629acd8ce0ad4b77b0fd85d6280c8528d876d68 100644
--- a/dygraph/deploy/cpp/src/object_detector.cc
+++ b/dygraph/deploy/cpp/src/object_detector.cc
@@ -27,7 +27,11 @@ void ObjectDetector::LoadModel(const std::string& model_dir,
                                const int min_subgraph_size,
                                const int batch_size,
                                const std::string& run_mode,
-                               const int gpu_id) {
+                               const int gpu_id,
+                               bool use_dynamic_shape,
+                               const int trt_min_shape,
+                               const int trt_max_shape,
+                               const int trt_opt_shape) {
   paddle_infer::Config config;
   std::string prog_file = model_dir + OS_PATH_SEP + "model.pdmodel";
   std::string params_file = model_dir + OS_PATH_SEP + "model.pdiparams";
@@ -35,26 +39,48 @@ void ObjectDetector::LoadModel(const std::string& model_dir,
   if (use_gpu) {
     config.EnableUseGpu(200, gpu_id);
     config.SwitchIrOptim(true);
+    // use tensorrt
+    bool use_calib_mode = false;
     if (run_mode != "fluid") {
       auto precision = paddle_infer::Config::Precision::kFloat32;
-      if (run_mode == "trt_fp16") {
+      if (run_mode == "trt_fp32") {
+        precision = paddle_infer::Config::Precision::kFloat32;
+      }
+      else if (run_mode == "trt_fp16") {
         precision = paddle_infer::Config::Precision::kHalf;
-      } else if (run_mode == "trt_int8") {
-        printf("TensorRT int8 mode is not supported now, "
-               "please use 'trt_fp32' or 'trt_fp16' instead");
+      }
+      else if (run_mode == "trt_int8") {
+        precision = paddle_infer::Config::Precision::kInt8;
+        use_calib_mode = true;
       } else {
-        if (run_mode != "trt_fp32") {
-          printf("run_mode should be 'fluid', 'trt_fp32' or 'trt_fp16'");
-        }
+        printf("run_mode should be 'fluid', 'trt_fp32', 'trt_fp16' or 'trt_int8'");
       }
+      // set tensorrt
       config.EnableTensorRtEngine(
-          1 << 10,
+          1 << 30,
           batch_size,
           min_subgraph_size,
           precision,
           false,
-          false);
-    }
+          use_calib_mode);
+
+      // set use dynamic shape
+      if (use_dynamic_shape) {
+        // set DynamicShape for the image tensor
+        const std::vector<int> min_input_shape = {1, trt_min_shape, trt_min_shape};
+        const std::vector<int> max_input_shape = {1, trt_max_shape, trt_max_shape};
+        const std::vector<int> opt_input_shape = {1, trt_opt_shape, trt_opt_shape};
+        const std::map<std::string, std::vector<int>> map_min_input_shape = {{"image", min_input_shape}};
+        const std::map<std::string, std::vector<int>> map_max_input_shape = {{"image", max_input_shape}};
+        const std::map<std::string, std::vector<int>> map_opt_input_shape = {{"image", opt_input_shape}};
+
+        config.SetTRTDynamicShapeInfo(map_min_input_shape,
+                                      map_max_input_shape,
+                                      map_opt_input_shape);
+        std::cout << "TensorRT dynamic shape enabled" << std::endl;
+      }
+    }
+
   } else {
     config.DisableGpu();
   }
@@ -171,8 +197,8 @@ void ObjectDetector::Predict(const cv::Mat& im,
   for (const auto& tensor_name : input_names) {
     auto in_tensor = predictor_->GetInputHandle(tensor_name);
     if (tensor_name == "image") {
-      int rh = inputs_.input_shape_[0];
-      int rw = inputs_.input_shape_[1];
+      int rh = inputs_.in_net_shape_[0];
+      int rw = inputs_.in_net_shape_[1];
       in_tensor->Reshape({1, 3, rh, rw});
       in_tensor->CopyFromCpu(inputs_.im_data_.data());
     } else if (tensor_name == "im_shape") {
diff --git a/dygraph/deploy/cpp/src/preprocess_op.cc b/dygraph/deploy/cpp/src/preprocess_op.cc
index 462958174ba75f1c7514612f18baf4c85b480283..8edd3eb1f2b7957649f7075bbcd20ba582c841a7 100644
--- a/dygraph/deploy/cpp/src/preprocess_op.cc
+++ b/dygraph/deploy/cpp/src/preprocess_op.cc
@@ -25,7 +25,7 @@ void InitInfo::Run(cv::Mat* im, ImageBlob* data) {
     static_cast<float>(im->cols)
   };
   data->scale_factor_ = {1., 1.};
-  data->input_shape_ = {
+  data->in_net_shape_ = {
     static_cast<float>(im->rows),
     static_cast<float>(im->cols)
   };
@@ -62,7 +62,11 @@ void Permute::Run(cv::Mat* im, ImageBlob* data) {
 
 void Resize::Run(cv::Mat* im, ImageBlob* data) {
   auto resize_scale = GenerateScale(*im);
-  data->input_shape_ = {
+  data->im_shape_ = {
+    static_cast<float>(im->cols * resize_scale.first),
+    static_cast<float>(im->rows * resize_scale.second)
+  };
+  data->in_net_shape_ = {
     static_cast<float>(im->cols * resize_scale.first),
     static_cast<float>(im->rows * resize_scale.second)
   };
@@ -121,7 +125,7 @@ void PadStride::Run(cv::Mat* im, ImageBlob* data) {
     nw - rw,
     cv::BORDER_CONSTANT,
     cv::Scalar(0));
-  data->input_shape_ = {
+  data->in_net_shape_ = {
     static_cast<float>(im->rows),
     static_cast<float>(im->cols),
   };
diff --git a/dygraph/deploy/python/infer.py b/dygraph/deploy/python/infer.py
index 4d9feb214d8b00dc9e6e50619b49afa38c6e6b79..b10576132b6d3f979f59ec720586ccee5f7534e2 100644
--- a/dygraph/deploy/python/infer.py
+++ b/dygraph/deploy/python/infer.py
@@ -46,6 +46,11 @@ class Detector(object):
         model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml
         use_gpu (bool): whether use gpu
         run_mode (str): mode of running(fluid/trt_fp32/trt_fp16)
+        use_dynamic_shape (bool): use dynamic shape or not
+        trt_min_shape (int): min shape for dynamic shape in trt
+        trt_max_shape (int): max shape for dynamic shape in trt
+        trt_opt_shape
(int): opt shape for dynamic shape in trt + run_mode (str): mode of running(fluid/trt_fp32/trt_fp16) threshold (float): threshold to reserve the result for output. """ @@ -54,13 +59,21 @@ class Detector(object): model_dir, use_gpu=False, run_mode='fluid', + use_dynamic_shape=False, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, threshold=0.5): self.pred_config = pred_config self.predictor = load_predictor( model_dir, run_mode=run_mode, min_subgraph_size=self.pred_config.min_subgraph_size, - use_gpu=use_gpu) + use_gpu=use_gpu, + use_dynamic_shape=use_dynamic_shape, + trt_min_shape=trt_min_shape, + trt_max_shape=trt_max_shape, + trt_opt_shape=trt_opt_shape) def preprocess(self, im): preprocess_ops = [] @@ -154,6 +167,10 @@ class DetectorSOLOv2(Detector): model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml use_gpu (bool): whether use gpu run_mode (str): mode of running(fluid/trt_fp32/trt_fp16) + use_dynamic_shape (bool): use dynamic shape or not + trt_min_shape (int): min shape for dynamic shape in trt + trt_max_shape (int): max shape for dynamic shape in trt + trt_opt_shape (int): opt shape for dynamic shape in trt threshold (float): threshold to reserve the result for output. """ @@ -162,13 +179,21 @@ class DetectorSOLOv2(Detector): model_dir, use_gpu=False, run_mode='fluid', + use_dynamic_shape=False, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, threshold=0.5): self.pred_config = pred_config self.predictor = load_predictor( model_dir, run_mode=run_mode, min_subgraph_size=self.pred_config.min_subgraph_size, - use_gpu=use_gpu) + use_gpu=use_gpu, + use_dynamic_shape=use_dynamic_shape, + trt_min_shape=trt_min_shape, + trt_max_shape=trt_max_shape, + trt_opt_shape=trt_opt_shape) def predict(self, image, @@ -287,11 +312,20 @@ def load_predictor(model_dir, run_mode='fluid', batch_size=1, use_gpu=False, - min_subgraph_size=3): + min_subgraph_size=3, + use_dynamic_shape=False, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640): """set AnalysisConfig, generate AnalysisPredictor Args: model_dir (str): root path of __model__ and __params__ use_gpu (bool): whether use gpu + run_mode (str): mode of running(fluid/trt_fp32/trt_fp16) + use_dynamic_shape (bool): use dynamic shape or not + trt_min_shape (int): min shape for dynamic shape in trt + trt_max_shape (int): max shape for dynamic shape in trt + trt_opt_shape (int): opt shape for dynamic shape in trt Returns: predictor (PaddlePredictor): AnalysisPredictor Raises: @@ -301,9 +335,12 @@ def load_predictor(model_dir, raise ValueError( "Predict by TensorRT mode: {}, expect use_gpu==True, but use_gpu == {}" .format(run_mode, use_gpu)) - if run_mode == 'trt_int8': - raise ValueError("TensorRT int8 mode is not supported now, " - "please use trt_fp32 or trt_fp16 instead.") + if run_mode == 'trt_int8' and not os.path.exists( + os.path.join(model_dir, '_opt_cache')): + raise ValueError( + "TensorRT int8 must calibration first, and model_dir must has _opt_cache dir" + ) + use_calib_mode = True if run_mode == 'trt_int8' else False config = Config( os.path.join(model_dir, 'model.pdmodel'), os.path.join(model_dir, 'model.pdiparams')) @@ -316,11 +353,7 @@ def load_predictor(model_dir, # initial GPU memory(M), device ID config.enable_use_gpu(200, 0) # optimize graph and fuse op - # FIXME(dkp): ir optimize may prune variable inside graph - # and incur error in Paddle 2.0, e.g. 
in SSDLite - # FCOS model, set as False currently and should - # be set as True after switch_ir_optim fixed - config.switch_ir_optim(False) + config.switch_ir_optim(True) else: config.disable_gpu() @@ -331,7 +364,16 @@ def load_predictor(model_dir, min_subgraph_size=min_subgraph_size, precision_mode=precision_map[run_mode], use_static=False, - use_calib_mode=False) + use_calib_mode=use_calib_mode) + + if use_dynamic_shape: + print('use_dynamic_shape') + min_input_shape = {'image': [1, 3, trt_min_shape, trt_min_shape]} + max_input_shape = {'image': [1, 3, trt_max_shape, trt_max_shape]} + opt_input_shape = {'image': [1, 3, trt_opt_shape, trt_opt_shape]} + config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, + opt_input_shape) + print('trt set dynamic shape done!') # disable print log when predict config.disable_glog_info() @@ -424,13 +466,21 @@ def main(): pred_config, FLAGS.model_dir, use_gpu=FLAGS.use_gpu, - run_mode=FLAGS.run_mode) + run_mode=FLAGS.run_mode, + use_dynamic_shape=FLAGS.use_dynamic_shape, + trt_min_shape=FLAGS.trt_min_shape, + trt_max_shape=FLAGS.trt_max_shape, + trt_opt_shape=FLAGS.trt_opt_shape) if pred_config.arch == 'SOLOv2': detector = DetectorSOLOv2( pred_config, FLAGS.model_dir, use_gpu=FLAGS.use_gpu, - run_mode=FLAGS.run_mode) + run_mode=FLAGS.run_mode, + use_dynamic_shape=FLAGS.use_dynamic_shape, + trt_min_shape=FLAGS.trt_min_shape, + trt_max_shape=FLAGS.trt_max_shape, + trt_opt_shape=FLAGS.trt_opt_shape) # predict from image if FLAGS.image_file != '': predict_image(detector) @@ -480,6 +530,23 @@ if __name__ == '__main__': type=str, default="output", help="Directory of output visualization files.") + parser.add_argument( + "--use_dynamic_shape", + type=ast.literal_eval, + default=False, + help="Dynamic_shape for TensorRT.") + parser.add_argument( + "--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") + parser.add_argument( + "--trt_max_shape", + type=int, + default=1280, + help="max_shape for TensorRT.") + parser.add_argument( + "--trt_opt_shape", + type=int, + default=640, + help="opt_shape for TensorRT.") FLAGS = parser.parse_args() print_arguments(FLAGS) diff --git a/dygraph/deploy/python/trt_int8_calib.py b/dygraph/deploy/python/trt_int8_calib.py new file mode 100644 index 0000000000000000000000000000000000000000..32f0e0ddea30a1790428bf67ed0348a60ec74c39 --- /dev/null +++ b/dygraph/deploy/python/trt_int8_calib.py @@ -0,0 +1,300 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
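+
+# This script runs TensorRT int8 calibration for an exported PaddleDetection model:
+# it builds a predictor with run_mode 'trt_int8' and use_calib_mode enabled, then
+# runs inference over every *.jpg under --image_dir so that TensorRT can collect
+# calibration statistics (deploy/python/infer.py expects the resulting _opt_cache
+# directory under --model_dir before it will run with --run_mode=trt_int8).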
+ +import os +import argparse +import time +import yaml +import ast +from functools import reduce + +from PIL import Image +import cv2 +import numpy as np +import glob +import paddle +from preprocess import preprocess, Resize, NormalizeImage, Permute, PadStride +from visualize import visualize_box_mask +from paddle.inference import Config +from paddle.inference import create_predictor + +# Global dictionary +SUPPORT_MODELS = { + 'YOLO', + 'RCNN', + 'SSD', + 'FCOS', + 'SOLOv2', + 'TTFNet', +} + + +class Detector(object): + """ + Args: + config (object): config of model, defined by `Config(model_dir)` + model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml + use_gpu (bool): whether use gpu + """ + + def __init__(self, pred_config, model_dir, use_gpu=False): + self.pred_config = pred_config + self.predictor = load_predictor( + model_dir, + min_subgraph_size=self.pred_config.min_subgraph_size, + use_gpu=use_gpu) + + def preprocess(self, im): + preprocess_ops = [] + for op_info in self.pred_config.preprocess_infos: + new_op_info = op_info.copy() + op_type = new_op_info.pop('type') + preprocess_ops.append(eval(op_type)(**new_op_info)) + im, im_info = preprocess(im, preprocess_ops, + self.pred_config.input_shape) + inputs = create_inputs(im, im_info) + return inputs + + def postprocess(self, np_boxes, np_masks, inputs, threshold=0.5): + # postprocess output of predictor + results = {} + if self.pred_config.arch in ['Face']: + h, w = inputs['im_shape'] + scale_y, scale_x = inputs['scale_factor'] + w, h = float(h) / scale_y, float(w) / scale_x + np_boxes[:, 2] *= h + np_boxes[:, 3] *= w + np_boxes[:, 4] *= h + np_boxes[:, 5] *= w + results['boxes'] = np_boxes + if np_masks is not None: + results['masks'] = np_masks + return results + + def predict(self, + image, + threshold=0.5, + warmup=0, + repeats=1, + run_benchmark=False): + ''' + Args: + image (str/np.ndarray): path of image/ np.ndarray read by cv2 + threshold (float): threshold of predicted box' score + Returns: + results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, + matix element:[class, score, x_min, y_min, x_max, y_max] + MaskRCNN's results include 'masks': np.ndarray: + shape: [N, im_h, im_w] + ''' + inputs = self.preprocess(image) + np_boxes, np_masks = None, None + input_names = self.predictor.get_input_names() + for i in range(len(input_names)): + input_tensor = self.predictor.get_input_handle(input_names[i]) + input_tensor.copy_from_cpu(inputs[input_names[i]]) + + for i in range(warmup): + self.predictor.run() + output_names = self.predictor.get_output_names() + boxes_tensor = self.predictor.get_output_handle(output_names[0]) + np_boxes = boxes_tensor.copy_to_cpu() + if self.pred_config.mask: + masks_tensor = self.predictor.get_output_handle(output_names[2]) + np_masks = masks_tensor.copy_to_cpu() + + t1 = time.time() + for i in range(repeats): + self.predictor.run() + output_names = self.predictor.get_output_names() + boxes_tensor = self.predictor.get_output_handle(output_names[0]) + np_boxes = boxes_tensor.copy_to_cpu() + if self.pred_config.mask: + masks_tensor = self.predictor.get_output_handle(output_names[2]) + np_masks = masks_tensor.copy_to_cpu() + t2 = time.time() + ms = (t2 - t1) * 1000.0 / repeats + print("Inference: {} ms per batch image".format(ms)) + + # do not perform postprocess in benchmark mode + results = [] + if not run_benchmark: + if reduce(lambda x, y: x * y, np_boxes.shape) < 6: + print('[WARNNING] No object detected.') + results = {'boxes': np.array([])} + else: 
+ results = self.postprocess( + np_boxes, np_masks, inputs, threshold=threshold) + + return results + + +def create_inputs(im, im_info): + """generate input for different model type + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + model_arch (str): model type + Returns: + inputs (dict): input of model + """ + inputs = {} + inputs['image'] = np.array((im, )).astype('float32') + inputs['im_shape'] = np.array((im_info['im_shape'], )).astype('float32') + inputs['scale_factor'] = np.array( + (im_info['scale_factor'], )).astype('float32') + + return inputs + + +class PredictConfig(): + """set config of preprocess, postprocess and visualize + Args: + model_dir (str): root path of model.yml + """ + + def __init__(self, model_dir): + # parsing Yaml config for Preprocess + deploy_file = os.path.join(model_dir, 'infer_cfg.yml') + with open(deploy_file) as f: + yml_conf = yaml.safe_load(f) + self.check_model(yml_conf) + self.arch = yml_conf['arch'] + self.preprocess_infos = yml_conf['Preprocess'] + self.min_subgraph_size = yml_conf['min_subgraph_size'] + self.labels = yml_conf['label_list'] + self.mask = False + if 'mask' in yml_conf: + self.mask = yml_conf['mask'] + self.input_shape = yml_conf['image_shape'] + self.print_config() + + def check_model(self, yml_conf): + """ + Raises: + ValueError: loaded model not in supported model type + """ + for support_model in SUPPORT_MODELS: + if support_model in yml_conf['arch']: + return True + raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[ + 'arch'], SUPPORT_MODELS)) + + def print_config(self): + print('----------- Model Configuration -----------') + print('%s: %s' % ('Model Arch', self.arch)) + print('%s: ' % ('Transform Order')) + for op_info in self.preprocess_infos: + print('--%s: %s' % ('transform op', op_info['type'])) + print('--------------------------------------------') + + +def load_predictor(model_dir, batch_size=1, use_gpu=False, min_subgraph_size=3): + """set AnalysisConfig, generate AnalysisPredictor + Args: + model_dir (str): root path of __model__ and __params__ + use_gpu (bool): whether use gpu + Returns: + predictor (PaddlePredictor): AnalysisPredictor + Raises: + ValueError: predict by TensorRT need use_gpu == True. 
+    """
+    run_mode = 'trt_int8'
+    if not use_gpu and not run_mode == 'fluid':
+        raise ValueError(
+            "Predict by TensorRT mode: {}, expect use_gpu==True, but use_gpu == {}"
+            .format(run_mode, use_gpu))
+    config = Config(
+        os.path.join(model_dir, 'model.pdmodel'),
+        os.path.join(model_dir, 'model.pdiparams'))
+    precision_map = {
+        'trt_int8': Config.Precision.Int8,
+        'trt_fp32': Config.Precision.Float32,
+        'trt_fp16': Config.Precision.Half
+    }
+    if use_gpu:
+        # initial GPU memory(M), device ID
+        config.enable_use_gpu(200, 0)
+        # optimize graph and fuse op
+        config.switch_ir_optim(True)
+    else:
+        config.disable_gpu()
+
+    if run_mode in precision_map.keys():
+        config.enable_tensorrt_engine(
+            workspace_size=1 << 10,
+            max_batch_size=batch_size,
+            min_subgraph_size=min_subgraph_size,
+            precision_mode=precision_map[run_mode],
+            use_static=False,
+            use_calib_mode=True)
+
+    # disable print log when predict
+    config.disable_glog_info()
+    # enable shared memory
+    config.enable_memory_optim()
+    # disable feed, fetch OP, needed by zero_copy_run
+    config.switch_use_feed_fetch_ops(False)
+    predictor = create_predictor(config)
+    return predictor
+
+
+def print_arguments(args):
+    print('----------- Running Arguments -----------')
+    for arg, value in sorted(vars(args).items()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------')
+
+
+def predict_image_dir(detector):
+    for image_file in glob.glob(FLAGS.image_dir + '/*.jpg'):
+        print('image_file is', image_file)
+        results = detector.predict(image_file, threshold=0.5)
+
+
+def main():
+    pred_config = PredictConfig(FLAGS.model_dir)
+    detector = Detector(pred_config, FLAGS.model_dir, use_gpu=FLAGS.use_gpu)
+    # predict from image
+    if FLAGS.image_dir != '':
+        predict_image_dir(detector)
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--model_dir",
+        type=str,
+        default=None,
+        help=("Directory include:'model.pdiparams', 'model.pdmodel', "
+              "'infer_cfg.yml', created by tools/export_model.py."),
+        required=True)
+    parser.add_argument(
+        "--image_dir", type=str, default='', help="Directory of image file.")
+    parser.add_argument(
+        "--use_gpu",
+        type=ast.literal_eval,
+        default=False,
+        help="Whether to predict with GPU.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="output",
+        help="Directory of output visualization files.")
+    FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
+
+    main()
diff --git a/dygraph/deploy/tensorrt/README.md b/dygraph/deploy/tensorrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ccf46603f0c38f5494aba7be956767c8b5716eba
--- /dev/null
+++ b/dygraph/deploy/tensorrt/README.md
@@ -0,0 +1,76 @@
+# TensorRT Deployment Tutorial
+TensorRT is NVIDIA's acceleration library for unified model deployment. It can be used on hardware such as the V100 and Jetson Xavier and can greatly speed up inference. For the Paddle TensorRT tutorial, see [Inference with the Paddle-TensorRT library](https://paddle-inference.readthedocs.io/en/latest/optimize/paddle_trt.html#).
+
+## 1. Install the Paddle Inference library
+- Python package: download a wheel built with TensorRT from [here](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-release) and install it.
+
+- C++ inference library: download a library compiled with TensorRT from [here](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/build_and_install_lib_cn.html).
+
+- If no precompiled package or library is provided for your environment, compile it yourself following [build from source](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/compile/linux-compile.html).
+
+Note: the TensorRT version on your machine must match the TensorRT version used by the inference library.
+
+## 2. Export the model
+For details on exporting a model, see the [PaddleDetection model export tutorial](../EXPORT_MODEL.md).
+
+## 3. Enable TensorRT acceleration
+### 3.1 Configure TensorRT
+When building the predictor config with the Paddle inference library, simply turn on the TensorRT engine:
+
+```
+config->EnableUseGpu(100, 0); // initialize 100 MB of GPU memory, use GPU ID 0
+config->GpuDeviceId(); // returns the GPU ID currently in use
+// Enable TensorRT to improve GPU inference performance; requires an inference library built with TensorRT
+config->EnableTensorRtEngine(1 << 20 /*workspace_size*/,
+                             batch_size /*max_batch_size*/,
+                             3 /*min_subgraph_size*/,
+                             AnalysisConfig::Precision::kFloat32 /*precision*/,
+                             false /*use_static*/,
+                             false /*use_calib_mode*/);
+
+```
+
+### 3.2 TensorRT fixed-shape prediction
+With TensorRT version <= 5, only fixed-shape input is supported.
+
+Specify the model input shape when exporting the model by setting `TestReader.inputs_def.image_shape=[3,640,640]`; see the [PaddleDetection model export tutorial](../EXPORT_MODEL.md) for details.
+
+`TestReader.inputs_def.image_shape` sets the shape of the data fed into the TensorRT engine (for models such as FasterRCNN, `TestReader.inputs_def.image_shape` specifies the image shape before the `Pad` operation).
+
+You can open the `model.pdmodel` file with [visualdl](https://www.paddlepaddle.org.cn/paddle/visualdl/demo/graph) and check whether the first input tensor has a fixed shape; if no shape is specified, it is shown as `?`, as in the figure below:
+![img](imgs/input_shape.png)
+
+### 3.3 TensorRT dynamic-shape prediction
+
+With TensorRT version >= 6, dynamic-shape input is supported.
+For dynamic-shape input in the Paddle inference library, see the `SetTRTDynamicShapeInfo` function described in the [Paddle C++ inference docs](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/native_infer.html).
+
+Dynamic-shape parameters of `python/infer.py` (an example command follows the list):
+
+- use_dynamic_shape  whether the TensorRT input shape is dynamic, default: False
+
+- trt_min_shape  minimum height/width of the TensorRT input image, default: 1
+
+- trt_max_shape  maximum height/width of the TensorRT input image, default: 1280
+
+- trt_opt_shape  optimal height/width of the TensorRT input image, default: 640
+
+**Note: the dynamic shape set in `TensorRT` is 4-dimensional; only the height and width of the input image are set here.**
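+
+For example, a typical `python/infer.py` invocation with dynamic shape enabled might look like the following (the model and image paths are only illustrative):
+
+```
+python deploy/python/infer.py --model_dir=./output_inference/faster_rcnn_r50_fpn_1x_coco \
+                              --image_file=./demo.jpg --use_gpu=True --run_mode=trt_fp16 \
+                              --use_dynamic_shape=True --trt_min_shape=320 \
+                              --trt_max_shape=1280 --trt_opt_shape=640
+```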
+
+## 4. FAQ
+**Q:** An error says there is no `tensorrt_op`.
+**A:** Check that you are using a Paddle Python package or inference library built with TensorRT.
+
+**Q:** An error says `op out of memory`.
+**A:** Check whether the GPU is also being used by someone else, and try a free GPU.
+
+**Q:** An error says `some trt inputs dynamic shape info not set`.
+**A:** This happens because `TensorRT` splits the network into multiple subgraphs, and only the dynamic shape of the input data was set; the inputs of the other subgraphs have no dynamic shape configured. There are two solutions:
+
+- Option 1: increase `min_subgraph_size` so that these subgraphs are skipped during optimization. Following the error message, set min_subgraph_size to a value larger than the number of OPs in the subgraphs whose inputs have no dynamic shape configured.
+`min_subgraph_size` means that, when the TensorRT engine is loaded, only runs of consecutive OPs that TensorRT can optimize and whose count exceeds `min_subgraph_size` will be optimized.
+
+- Option 2: find these subgraph inputs and set their dynamic shapes as well, in the same way as above.
+
+**Q:** How do I enable logging?
+**A:** Logging is enabled by default in the inference library; simply comment out `config.disable_glog_info()` to see the logs.
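+
+`python/infer.py` also accepts `--run_mode=trt_int8`, which requires a calibration table under `<model_dir>/_opt_cache`. A rough sketch of that workflow, with purely illustrative model and image paths, is:
+
+```
+# generate the int8 calibration table from a directory of sample images
+python deploy/python/trt_int8_calib.py --model_dir=./output_inference/yolov3_darknet53_270e_coco \
+                                       --image_dir=./calib_images --use_gpu=True
+# run inference in int8 mode once <model_dir>/_opt_cache exists
+python deploy/python/infer.py --model_dir=./output_inference/yolov3_darknet53_270e_coco \
+                              --image_file=./demo.jpg --use_gpu=True --run_mode=trt_int8
+```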
diff --git a/dygraph/deploy/tensorrt/imgs/input_shape.png b/dygraph/deploy/tensorrt/imgs/input_shape.png
new file mode 100644
index 0000000000000000000000000000000000000000..1148116f81ec78ae625f342fa51dcf778d1fb4ca
Binary files /dev/null and b/dygraph/deploy/tensorrt/imgs/input_shape.png differ