diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c497e3e048c4dd8d5c1291286de2ab9d218b914..59d6fcb07d27e1f3ab259e69d36708b775c1852a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,6 +54,7 @@ option(SERVER "Compile Paddle Serving Server" OFF)
 option(APP "Compile Paddle Serving App package" OFF)
 option(WITH_ELASTIC_CTR "Compile ELASITC-CTR solution" OFF)
 option(PACK "Compile for whl" OFF)
+option(WITH_TRT "Compile Paddle Serving with TRT" OFF)
 
 set(WITH_MKLML ${WITH_MKL})
 if (NOT DEFINED WITH_MKLDNN)
diff --git a/cmake/paddlepaddle.cmake b/cmake/paddlepaddle.cmake
index 54aae0bdc249c6eacbd4bf6b5cc42cbba9f08784..eee5369f775fae11a49ca92681a6ae84953617f0 100644
--- a/cmake/paddlepaddle.cmake
+++ b/cmake/paddlepaddle.cmake
@@ -34,7 +34,11 @@ message( "WITH_GPU = ${WITH_GPU}")
 SET(PADDLE_VERSION "1.8.1")
 
 if (WITH_GPU)
-  SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
+  if (WITH_TRT)
+    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6")
+  else()
+    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
+  endif()
 else()
   if (WITH_AVX)
     if (WITH_MKLML)
diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto
index 8956022685090c94be2037445c646e9fbffd1a5c..3dfc1db4412c95c9e82c7c5c2a21a29519b84267 100644
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -44,6 +44,7 @@ message EngineDesc {
   optional bool static_optimization = 14;
   optional bool force_update_static_cache = 15;
   optional bool enable_ir_optimization = 16;
+  optional bool use_trt = 17;
 };
 
 // model_toolkit conf
diff --git a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
index 2a4da4b9b03e716b9e8148dbfd0200b887ee66e1..e9af5ecea1a170b49622abde85a9bea3af52cecf 100644
--- a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
+++ b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
@@ -198,13 +198,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
       analysis_config.EnableMemoryOptim();
     }
 
-    /*
-    if (params.enable_ir_optimization()) {
-      analysis_config.SwitchIrOptim(true);
-    } else {
-      analysis_config.SwitchIrOptim(false);
-    }
-    */
+#if 0
 
     int min_seq_len = 1;
     int max_seq_len = 512;
@@ -241,16 +235,18 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
        {input4_name, {batch, head_number, opt_seq_len, opt_seq_len}},
     };
 
-    analysis_config.EnableTensorRtEngine(
-        1 << 30,
-        batch,
-        5,
-        paddle::AnalysisConfig::Precision::kHalf,
-        true,
-        true);
     analysis_config.SetTRTDynamicShapeInfo(
         min_input_shape, max_input_shape, opt_input_shape);
-
+#endif
+    if (params.use_trt()) {
+      analysis_config.EnableTensorRtEngine(
+          1 << 30,
+          batch,
+          5,
+          paddle::AnalysisConfig::Precision::kFloat32,
+          true,
+          true);
+    }
     AutoLock lock(GlobalPaddleCreateMutex::instance());
     _core = paddle::CreatePaddlePredictor(analysis_config);
     if (NULL == _core.get()) {
diff --git a/python/paddle_serving_server_gpu/__init__.py b/python/paddle_serving_server_gpu/__init__.py
index becfbb20090799aaf40d79973964e497cf599436..0d6936e94a74ab72c92ef7870fce02598a5e5306 100644
--- a/python/paddle_serving_server_gpu/__init__.py
+++ b/python/paddle_serving_server_gpu/__init__.py
@@ -73,6 +73,8 @@ def serve_args():
         default=False,
         action="store_true",
         help="Use Multi-language-service")
+    parser.add_argument(
+        "--use_trt", default=False, action="store_true", help="Use TensorRT")
     return parser.parse_args()
 
 
@@ -195,6 +197,7 @@ class Server(object):
         self.cur_path = os.getcwd()
         self.use_local_bin = False
         self.gpuid = 0
+        self.use_trt = False
         self.model_config_paths = None  # for multi-model in a workflow
 
     def set_max_concurrency(self, concurrency):
@@ -245,6 +248,9 @@ class Server(object):
     def set_gpuid(self, gpuid=0):
         self.gpuid = gpuid
 
+    def set_trt(self):
+        self.use_trt = True
+
     def _prepare_engine(self, model_config_paths, device):
         if self.model_toolkit_conf == None:
             self.model_toolkit_conf = server_sdk.ModelToolkitConf()
@@ -264,6 +270,7 @@ class Server(object):
             engine.enable_ir_optimization = self.ir_optimization
             engine.static_optimization = False
             engine.force_update_static_cache = False
+            engine.use_trt = self.use_trt
 
             if device == "cpu":
                 engine.type = "FLUID_CPU_ANALYSIS_DIR"
diff --git a/python/paddle_serving_server_gpu/serve.py b/python/paddle_serving_server_gpu/serve.py
index 3b0941a97560f11a52808fc7e152419e2cec0ba0..8f16e0c7a266c7554475d2a3c07a762cad88b91d 100644
--- a/python/paddle_serving_server_gpu/serve.py
+++ b/python/paddle_serving_server_gpu/serve.py
@@ -64,6 +64,8 @@ def start_gpu_card_model(index, gpuid, args):  # pylint: disable=doc-string-missing
     server.set_memory_optimize(mem_optim)
     server.set_ir_optimize(ir_optim)
     server.set_max_body_size(max_body_size)
+    if args.use_trt:
+        server.set_trt()
 
     server.load_model_config(model)
     server.prepare_server(workdir=workdir, port=port, device=device)
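For reference, a minimal usage sketch of the new switch (not part of the patch; the model directory, port, and op pipeline below are placeholder assumptions):

# Hypothetical example: start the GPU server with TensorRT enabled, either via
# the new --use_trt flag added to serve.py:
#
#   python -m paddle_serving_server_gpu.serve \
#       --model uci_housing_model --port 9292 --gpu_ids 0 --use_trt
#
# or through the Python API, using the new Server.set_trt() setter:
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server

op_maker = OpMaker()
read_op = op_maker.create('general_reader')
infer_op = op_maker.create('general_infer')
response_op = op_maker.create('general_response')

op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(infer_op)
op_seq_maker.add_op(response_op)

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_trt()                                # sets engine.use_trt = True
server.set_gpuid(0)
server.load_model_config('uci_housing_model')   # placeholder model directory
server.prepare_server(workdir='workdir', port=9292, device='gpu')
server.run_server()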