From 3cdad373da49adaa68874126347d077c9f33e10b Mon Sep 17 00:00:00 2001
From: MRXLT
Date: Tue, 11 Aug 2020 16:45:03 +0800
Subject: [PATCH] add trt

---
 CMakeLists.txt                               |  1 +
 cmake/paddlepaddle.cmake                     |  6 ++++-
 core/configure/proto/server_configure.proto |  1 +
 .../include/fluid_gpu_engine.h               | 26 ++++++++----------
 python/paddle_serving_server_gpu/__init__.py |  7 +++++
 python/paddle_serving_server_gpu/serve.py    |  2 ++
 6 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c497e3e..59d6fcb0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,6 +54,7 @@ option(SERVER "Compile Paddle Serving Server" OFF)
 option(APP "Compile Paddle Serving App package" OFF)
 option(WITH_ELASTIC_CTR "Compile ELASITC-CTR solution" OFF)
 option(PACK "Compile for whl" OFF)
+option(WITH_TRT "Compile Paddle Serving with TRT" OFF)
 
 set(WITH_MKLML ${WITH_MKL})
 if (NOT DEFINED WITH_MKLDNN)
diff --git a/cmake/paddlepaddle.cmake b/cmake/paddlepaddle.cmake
index 54aae0bd..eee5369f 100644
--- a/cmake/paddlepaddle.cmake
+++ b/cmake/paddlepaddle.cmake
@@ -34,7 +34,11 @@ message( "WITH_GPU = ${WITH_GPU}")
 SET(PADDLE_VERSION "1.8.1")
 
 if (WITH_GPU)
-  SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
+  if (WITH_TRT)
+    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6")
+  else()
+    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
+  endif()
 else()
   if (WITH_AVX)
     if (WITH_MKLML)
diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto
index 89560226..3dfc1db4 100644
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -44,6 +44,7 @@ message EngineDesc {
   optional bool static_optimization = 14;
   optional bool force_update_static_cache = 15;
   optional bool enable_ir_optimization = 16;
+  optional bool use_trt = 17;
 };
 
 // model_toolkit conf
diff --git a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
index 2a4da4b9..e9af5ece 100644
--- a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
+++ b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
@@ -198,13 +198,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
       analysis_config.EnableMemoryOptim();
     }
 
-    /*
-    if (params.enable_ir_optimization()) {
-      analysis_config.SwitchIrOptim(true);
-    } else {
-      analysis_config.SwitchIrOptim(false);
-    }
-    */
+#if 0
 
     int min_seq_len = 1;
     int max_seq_len = 512;
@@ -241,16 +235,18 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
         {input4_name, {batch, head_number, opt_seq_len, opt_seq_len}},
     };
 
-    analysis_config.EnableTensorRtEngine(
-        1 << 30,
-        batch,
-        5,
-        paddle::AnalysisConfig::Precision::kHalf,
-        true,
-        true);
     analysis_config.SetTRTDynamicShapeInfo(
         min_input_shape, max_input_shape, opt_input_shape);
-
+#endif
+    if (params.use_trt()) {
+      analysis_config.EnableTensorRtEngine(
+          1 << 30,
+          batch,
+          5,
+          paddle::AnalysisConfig::Precision::kFloat32,
+          true,
+          true);
+    }
     AutoLock lock(GlobalPaddleCreateMutex::instance());
     _core = paddle::CreatePaddlePredictor(analysis_config);
 
diff --git a/python/paddle_serving_server_gpu/__init__.py b/python/paddle_serving_server_gpu/__init__.py
index becfbb20..0d6936e9 100644
--- a/python/paddle_serving_server_gpu/__init__.py
+++ b/python/paddle_serving_server_gpu/__init__.py
@@ -73,6 +73,8 @@ def serve_args():
         default=False,
         action="store_true",
         help="Use Multi-language-service")
+    parser.add_argument(
+        "--use_trt", default=False, action="store_true", help="Use TensorRT")
     return parser.parse_args()
 
 
@@ -195,6 +197,7 @@ class Server(object):
         self.cur_path = os.getcwd()
         self.use_local_bin = False
         self.gpuid = 0
+        self.use_trt = False
         self.model_config_paths = None  # for multi-model in a workflow
 
     def set_max_concurrency(self, concurrency):
@@ -245,6 +248,9 @@ class Server(object):
     def set_gpuid(self, gpuid=0):
         self.gpuid = gpuid
 
+    def set_trt(self):
+        self.use_trt = True
+
     def _prepare_engine(self, model_config_paths, device):
         if self.model_toolkit_conf == None:
             self.model_toolkit_conf = server_sdk.ModelToolkitConf()
@@ -264,6 +270,7 @@ class Server(object):
         engine.enable_ir_optimization = self.ir_optimization
         engine.static_optimization = False
         engine.force_update_static_cache = False
+        engine.use_trt = self.use_trt
 
         if device == "cpu":
             engine.type = "FLUID_CPU_ANALYSIS_DIR"
diff --git a/python/paddle_serving_server_gpu/serve.py b/python/paddle_serving_server_gpu/serve.py
index 3b0941a9..8f16e0c7 100644
--- a/python/paddle_serving_server_gpu/serve.py
+++ b/python/paddle_serving_server_gpu/serve.py
@@ -64,6 +64,8 @@ def start_gpu_card_model(index, gpuid, args):  # pylint: disable=doc-string-missing
     server.set_memory_optimize(mem_optim)
     server.set_ir_optimize(ir_optim)
     server.set_max_body_size(max_body_size)
+    if args.use_trt:
+        server.set_trt()
 
     server.load_model_config(model)
     server.prepare_server(workdir=workdir, port=port, device=device)
--
GitLab
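For reviewers, a minimal sketch of how the new switch is exercised end to end through the Python API added above. This is an illustration, not part of the patch: the model directory, workdir, and port are placeholders, and the op pipeline is assumed to be the usual general_reader/general_infer/general_response sequence from paddle_serving_server_gpu.

```python
# Sketch: start a GPU server with the TensorRT engine enabled.
# Placeholder values: "serving_server_model", "workdir", port 9292.
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server

op_maker = OpMaker()
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(op_maker.create('general_reader'))
op_seq_maker.add_op(op_maker.create('general_infer'))
op_seq_maker.add_op(op_maker.create('general_response'))

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_gpuid(0)
server.set_trt()  # flips self.use_trt, so _prepare_engine() sets EngineDesc.use_trt
server.load_model_config("serving_server_model")  # placeholder model dir
server.prepare_server(workdir="workdir", port=9292, device="gpu")
server.run_server()
```

Equivalently, the serve entry point should now accept the flag directly, along the lines of `python -m paddle_serving_server_gpu.serve --model serving_server_model --port 9292 --gpu_ids 0 --use_trt`.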