Commit 3cdad373 authored by MRXLT

add trt

Parent 460c43b8
@@ -54,6 +54,7 @@ option(SERVER "Compile Paddle Serving Server" OFF)
 option(APP "Compile Paddle Serving App package" OFF)
 option(WITH_ELASTIC_CTR "Compile ELASITC-CTR solution" OFF)
 option(PACK "Compile for whl" OFF)
+option(WITH_TRT "Compile Paddle Serving with TRT" OFF)
 set(WITH_MKLML ${WITH_MKL})
 if (NOT DEFINED WITH_MKLDNN)
......
@@ -34,7 +34,10 @@ message( "WITH_GPU = ${WITH_GPU}")
 SET(PADDLE_VERSION "1.8.1")
 if (WITH_GPU)
-  SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
+  if (WITH_TRT)
+    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6")
+  else()
+    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
+  endif()
 else()
 if (WITH_AVX)
 if (WITH_MKLML)
......
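Taken together, the two CMake changes wire the switch through the build: configuring with, e.g., `cmake -DWITH_GPU=ON -DWITH_TRT=ON ..` (the exact invocation is illustrative; other flags as in a normal GPU build) swaps the prebuilt Paddle 1.8.1 inference library from the CUDA 10/cuDNN 7 package to the CUDA 10.1/cuDNN 7.6/TensorRT 6 package.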
@@ -44,6 +44,7 @@ message EngineDesc {
   optional bool static_optimization = 14;
   optional bool force_update_static_cache = 15;
   optional bool enable_ir_optimization = 16;
+  optional bool use_trt = 17;
 };
 // model_toolkit conf
......
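For reference, a sketch of where the new field would surface in a generated model_toolkit.prototxt engine entry; the entry below is illustrative (placeholder name, neighboring values not taken from this commit):

```
engines {
  name: "general_infer_0"
  type: "FLUID_GPU_ANALYSIS_DIR"
  enable_ir_optimization: false
  use_trt: true
}
```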
@@ -178,7 +178,7 @@ class FluidGpuNativeCore : public FluidFamilyCore {
   }
 };
 
-class FluidGpuAnalysisDirCore : public FluidFamilyCore {
+class FluidTRTAnalysisDirCore : public FluidFamilyCore {
  public:
   int create(const predictor::InferEngineCreationParams& params) {
     std::string data_path = params.get_path();
@@ -198,13 +198,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
       analysis_config.EnableMemoryOptim();
     }
 
-    /*
-    if (params.enable_ir_optimization()) {
-      analysis_config.SwitchIrOptim(true);
-    } else {
-      analysis_config.SwitchIrOptim(false);
-    }
-    */
+#if 0
     int min_seq_len = 1;
     int max_seq_len = 512;
@@ -241,16 +235,18 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
         {input4_name, {batch, head_number, opt_seq_len, opt_seq_len}},
     };
-    analysis_config.EnableTensorRtEngine(
-        1 << 30,
-        batch,
-        5,
-        paddle::AnalysisConfig::Precision::kHalf,
-        true,
-        true);
-    analysis_config.SetTRTDynamicShapeInfo(
-        min_input_shape, max_input_shape, opt_input_shape);
+#endif
+    if (params.use_trt()) {
+      analysis_config.EnableTensorRtEngine(
+          1 << 30,
+          batch,
+          5,
+          paddle::AnalysisConfig::Precision::kFloat32,
+          true,
+          true);
+    }
 
     AutoLock lock(GlobalPaddleCreateMutex::instance());
     _core =
         paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
......
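The six positional arguments to EnableTensorRtEngine above are the TensorRT workspace size, max batch size, minimum subgraph size, precision, use_static, and use_calib_mode. A minimal sketch of the same switch through Paddle 1.8's Python inference API, for probing the engine outside Serving; the model directory and GPU memory pool size are placeholders:

```python
from paddle.fluid.core import AnalysisConfig, create_paddle_predictor

config = AnalysisConfig("serving_server_model")  # placeholder model dir
config.enable_use_gpu(100, 0)  # 100 MB initial GPU pool on device 0
config.enable_tensorrt_engine(
    workspace_size=1 << 30,   # bytes of GPU scratch space handed to TensorRT
    max_batch_size=1,         # plays the role of `batch` in the C++ code
    min_subgraph_size=5,      # only offload subgraphs with at least 5 ops
    precision_mode=AnalysisConfig.Precision.Float32,  # Precision::kFloat32
    use_static=True,          # serialize the built engine for reuse
    use_calib_mode=True)      # INT8 calibration path (inactive at FP32)
predictor = create_paddle_predictor(config)
```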
@@ -73,6 +73,8 @@ def serve_args():
         default=False,
         action="store_true",
         help="Use Multi-language-service")
+    parser.add_argument(
+        "--use_trt", default=False, action="store_true", help="Use TensorRT")
     return parser.parse_args()
@@ -195,6 +197,7 @@
         self.cur_path = os.getcwd()
         self.use_local_bin = False
         self.gpuid = 0
+        self.use_trt = False
         self.model_config_paths = None  # for multi-model in a workflow
 
     def set_max_concurrency(self, concurrency):
@@ -245,6 +248,9 @@
     def set_gpuid(self, gpuid=0):
         self.gpuid = gpuid
 
+    # a method named use_trt would be shadowed by the attribute set in __init__
+    def set_trt(self):
+        self.use_trt = True
+
     def _prepare_engine(self, model_config_paths, device):
         if self.model_toolkit_conf == None:
             self.model_toolkit_conf = server_sdk.ModelToolkitConf()
@@ -264,6 +270,7 @@
             engine.enable_ir_optimization = self.ir_optimization
             engine.static_optimization = False
             engine.force_update_static_cache = False
+            engine.use_trt = self.use_trt
 
             if device == "cpu":
                 engine.type = "FLUID_CPU_ANALYSIS_DIR"
......
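A sketch of exercising the new switch through the Python API, following the package's usual OpMaker/OpSeqMaker setup; model directory, workdir, and port are placeholders:

```python
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server

op_maker = OpMaker()
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(op_maker.create('general_reader'))
op_seq_maker.add_op(op_maker.create('general_infer'))
op_seq_maker.add_op(op_maker.create('general_response'))

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_trt()  # ends up as `engine.use_trt = True` in _prepare_engine
server.load_model_config("serving_server_model")
server.prepare_server(workdir="workdir", port=9292, device="gpu")
server.run_server()
```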
@@ -64,6 +64,8 @@ def start_gpu_card_model(index, gpuid, args):  # pylint: disable=doc-string-missing
     server.set_memory_optimize(mem_optim)
     server.set_ir_optimize(ir_optim)
     server.set_max_body_size(max_body_size)
+    if args.use_trt:
+        server.set_trt()
 
     server.load_model_config(model)
     server.prepare_server(workdir=workdir, port=port, device=device)
......
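End to end, the same switch is then available from the launcher command line, e.g. `python -m paddle_serving_server_gpu.serve --model serving_server_model --port 9292 --gpu_ids 0 --use_trt` (model path, port, and GPU id are placeholders).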