diff --git a/doc/TensorRT_Dynamic_Shape_CN.md b/doc/TensorRT_Dynamic_Shape_CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..7ace7933baacdabb279334114c7fd0bfdca34b69
--- /dev/null
+++ b/doc/TensorRT_Dynamic_Shape_CN.md
@@ -0,0 +1,154 @@
# How to Configure TensorRT Dynamic Shape
(Simplified Chinese | [English](./TensorRT_Dynamic_Shape_EN.md))

## Introduction

After TensorRT is enabled (`--use_trt`) in Pipeline/C++ Serving, dynamic shape ranges must be configured for models whose input shapes vary at runtime. Examples are given below for C++ Serving and Pipeline Serving respectively.

The dynamic shape API is as follows:
```
  void SetTRTDynamicShapeInfo(
      std::map<std::string, std::vector<int>> min_input_shape,
      std::map<std::string, std::vector<int>> max_input_shape,
      std::map<std::string, std::vector<int>> optim_input_shape,
      bool disable_trt_plugin_fp16 = false);
```
Each map goes from an input tensor name to its shape at that bound; for every input, min <= optim <= max must hold elementwise. For the detailed API description, see [C++](https://paddleinference.paddlepaddle.org.cn/api_reference/cxx_api_doc/Config/GPUConfig.html#tensorrt)/[Python](https://paddleinference.paddlepaddle.org.cn/api_reference/python_api_doc/Config/GPUConfig.html#tensorrt).

### C++ Serving

Modify the following code in `**/paddle_inference/paddle/include/paddle_engine.h`:

```
  if (engine_conf.has_use_trt() && engine_conf.use_trt()) {
    config.SwitchIrOptim(true);
    if (!engine_conf.has_use_gpu() || !engine_conf.use_gpu()) {
      config.EnableUseGpu(50, gpu_id);
      if (engine_conf.has_gpu_multi_stream() &&
          engine_conf.gpu_multi_stream()) {
        config.EnableGpuMultiStream();
      }
    }
    config.EnableTensorRtEngine((1 << 30) + (1 << 29),
                                max_batch,
                                min_subgraph_size,
                                precision_type,
                                true,
                                FLAGS_use_calib);
    // Set the TensorRT dynamic shape ranges. The tensor names below are
    // specific to this example model; replace them with the input names
    // of your own model.
    {
      int bsz = 1;
      int max_seq_len = 512;
      std::map<std::string, std::vector<int>> min_input_shape;
      std::map<std::string, std::vector<int>> max_input_shape;
      std::map<std::string, std::vector<int>> optim_input_shape;
      int hidden_size = 0;

      min_input_shape["stack_0.tmp_0"] = {1, 16, 1, 1};
      min_input_shape["stack_1.tmp_0"] = {1, 2, 1, 1};
      min_input_shape["input_mask"] = {1, 1, 1};
      min_input_shape["_generated_var_64"] = {1, 1, 768};
      min_input_shape["fc_0.tmp_0"] = {1, 1, 768};
      min_input_shape["_generated_var_87"] = {1, 1, 768};
      min_input_shape["tmp_175"] = {1, 1, 768};
      min_input_shape["c_allreduce_sum_0.tmp_0"] = {1, 1, 12288};
      min_input_shape["embedding_1.tmp_0"] = {1, 1, 12288};

      max_input_shape["stack_0.tmp_0"] = {bsz, 16, max_seq_len, max_seq_len};
      max_input_shape["stack_1.tmp_0"] = {bsz, 2, max_seq_len, max_seq_len};
      max_input_shape["input_mask"] = {bsz, max_seq_len, max_seq_len};
      max_input_shape["_generated_var_64"] = {bsz, max_seq_len, 768};
      max_input_shape["fc_0.tmp_0"] = {bsz, max_seq_len, 768};
      max_input_shape["_generated_var_87"] = {bsz, max_seq_len, 768};
      max_input_shape["tmp_175"] = {bsz, max_seq_len, 768};
      max_input_shape["c_allreduce_sum_0.tmp_0"] = {bsz, max_seq_len, 12288};
      max_input_shape["embedding_1.tmp_0"] = {bsz, max_seq_len, 12288};

      int g1 = 0;
      int g2 = 0;
      int t1 = 0;
      int t2 = 0;
      std::string var_name = "_generated_var_";
      std::string tmp_name = "tmp_";
      // Note: i == 32 falls into neither branch; its tensor
      // ("_generated_var_64") is already set explicitly above.
      for (int i = 0; i < 44; ++i) {
        if (i > 32) {
          hidden_size = 768;
          g1 = 2 * i - 1;
          g2 = 2 * i;
          t1 = 4 * i - 1;
          t2 = 4 * i;
          min_input_shape[var_name + std::to_string(g1)] = {1, 1, hidden_size};
          min_input_shape[var_name + std::to_string(g2)] = {1, 1, hidden_size};
          min_input_shape[tmp_name + std::to_string(t1)] = {1, 1, hidden_size};
          min_input_shape[tmp_name + std::to_string(t2)] = {1, 1, hidden_size};
          max_input_shape[var_name + std::to_string(g1)] = {bsz, max_seq_len, hidden_size};
          max_input_shape[var_name + std::to_string(g2)] = {bsz, max_seq_len, hidden_size};
          max_input_shape[tmp_name + std::to_string(t1)] = {bsz, max_seq_len, hidden_size};
          max_input_shape[tmp_name + std::to_string(t2)] = {bsz, max_seq_len, hidden_size};
        }
        if (i < 32) {
          hidden_size = 12288;
          g1 = 2 * i;
          g2 = 2 * i + 1;
          t1 = 4 * i;
          t2 = 4 * i + 3;
          min_input_shape[var_name + std::to_string(g1)] = {1, 1, hidden_size};
          min_input_shape[var_name + std::to_string(g2)] = {1, 1, hidden_size};
          min_input_shape[tmp_name + std::to_string(t1)] = {1, 1, hidden_size};
          min_input_shape[tmp_name + std::to_string(t2)] = {1, 1, hidden_size};
          max_input_shape[var_name + std::to_string(g1)] = {bsz, max_seq_len, hidden_size};
          max_input_shape[var_name + std::to_string(g2)] = {bsz, max_seq_len, hidden_size};
          max_input_shape[tmp_name + std::to_string(t1)] = {bsz, max_seq_len, hidden_size};
          max_input_shape[tmp_name + std::to_string(t2)] = {bsz, max_seq_len, hidden_size};
        }
      }
      // Use the maximum shapes as the optimal shapes.
      optim_input_shape = max_input_shape;
      config.SetTRTDynamicShapeInfo(
          min_input_shape, max_input_shape, optim_input_shape);
    }
    LOG(INFO) << "create TensorRT predictor";
  }
```
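The example above is tied to one particular model. To show just the shape-range pattern, here is a minimal sketch assuming a hypothetical model with a single dynamic input named `input_ids` (variable batch size and sequence length); the function name and all shapes are illustrative only and are not part of the Serving source.

```
#include <map>
#include <string>
#include <vector>

#include "paddle_inference_api.h"  // paddle_infer::Config

// Minimal sketch: register min/optim/max shapes for one input tensor.
void SetDynamicShapeForSingleInput(paddle_infer::Config* config) {
  // Each map goes from input tensor name to its shape at that bound.
  std::map<std::string, std::vector<int>> min_input_shape{
      {"input_ids", {1, 1}}};     // smallest batch size and sequence length
  std::map<std::string, std::vector<int>> max_input_shape{
      {"input_ids", {32, 512}}};  // largest batch size and sequence length
  std::map<std::string, std::vector<int>> optim_input_shape{
      {"input_ids", {16, 128}}};  // the shape TensorRT tunes kernels for

  // min <= optim <= max must hold elementwise for every input.
  config->SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
                                 optim_input_shape);
}
```

Choosing `optim_input_shape` close to the shapes seen most often in production generally gives the best performance; the `paddle_engine.h` example above simply reuses the max shapes.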
### Pipeline Serving

Modify the following code in `**/python/paddle_serving_app/local_predict.py`:

```
if use_trt:
    config.enable_tensorrt_engine(
        precision_mode=precision_type,
        workspace_size=1 << 20,
        max_batch_size=32,
        min_subgraph_size=3,
        use_static=False,
        use_calib_mode=False)
    # The tensor names and shapes below are for the example model;
    # replace them with the inputs of your own model.
    head_number = 12

    names = [
        "placeholder_0", "placeholder_1", "placeholder_2", "stack_0.tmp_0"
    ]
    min_input_shape = [1, 1, 1]
    max_input_shape = [100, 128, 1]
    opt_input_shape = [10, 60, 1]

    config.set_trt_dynamic_shape_info(
        {
            names[0]: min_input_shape,
            names[1]: min_input_shape,
            names[2]: min_input_shape,
            names[3]: [1, head_number, 1, 1]
        }, {
            names[0]: max_input_shape,
            names[1]: max_input_shape,
            names[2]: max_input_shape,
            names[3]: [100, head_number, 128, 128]
        }, {
            names[0]: opt_input_shape,
            names[1]: opt_input_shape,
            names[2]: opt_input_shape,
            names[3]: [10, head_number, 60, 60]
        })
```
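The three dictionaries passed to `set_trt_dynamic_shape_info` are positional (min, max, optim), which is easy to get wrong. Below is a minimal sketch of a small helper that validates min <= opt <= max elementwise before registering the ranges; the helper name and the commented example tensor name are illustrative and not part of Paddle Serving.

```
from paddle.inference import Config


def set_dynamic_shape(config: Config, min_shapes, opt_shapes, max_shapes):
    """Check min <= opt <= max elementwise, then register the shape ranges."""
    for name, min_shape in min_shapes.items():
        opt_shape, max_shape = opt_shapes[name], max_shapes[name]
        assert len(min_shape) == len(opt_shape) == len(max_shape), \
            f"rank mismatch for {name}"
        for lo, mid, hi in zip(min_shape, opt_shape, max_shape):
            assert lo <= mid <= hi, f"invalid shape range for {name}"
    # Paddle Inference expects the argument order: min, max, optim.
    config.set_trt_dynamic_shape_info(min_shapes, max_shapes, opt_shapes)


# Example usage with a single input tensor:
# set_dynamic_shape(
#     config,
#     min_shapes={"placeholder_0": [1, 1, 1]},
#     opt_shapes={"placeholder_0": [10, 60, 1]},
#     max_shapes={"placeholder_0": [100, 128, 1]},
# )
```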