Commit ccfaad36 authored by felixhjh

Configure TensorRT dynamic shape

Parent 27d9e17c
@@ -37,7 +37,7 @@ op:
model_config: ocr_det_model
#Fetch result list, using the alias_name of fetch_var in client_config
fetch_list: ["concat_1.tmp_0"]
fetch_list: ["save_infer_model/scale_0.tmp_1"]
# device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 0
@@ -53,6 +53,9 @@ op:
#ir_optim
ir_optim: True
#Minimum number of nodes a subgraph must contain to be optimized once TensorRT is enabled
#min_subgraph_size: 13
rec:
#Concurrency; when is_thread_op=True this is thread concurrency, otherwise process concurrency
concurrency: 3
@@ -73,7 +76,7 @@ op:
model_config: ocr_rec_model
#Fetch result list, using the alias_name of fetch_var in client_config
fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
fetch_list: ["save_infer_model/scale_0.tmp_1"]
# device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 0
@@ -88,3 +91,6 @@ op:
#ir_optim
ir_optim: True
#Minimum number of nodes a subgraph must contain to be optimized once TensorRT is enabled
#min_subgraph_size: 3
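
For reference, a minimal sketch of how these keys combine when TensorRT is actually switched on for the det op. This is illustrative only, assuming the keys shown in this diff; device_type: 2 selects TensorRT and needs a GPU device id (the devices key follows the standard pipeline config and is an assumption here):

    op:
        det:
            local_service_conf:
                model_config: ocr_det_model
                fetch_list: ["save_infer_model/scale_0.tmp_1"]
                # device_type 2 = tensorRT (see the legend above)
                device_type: 2
                devices: "0"
                ir_optim: True
                # smallest subgraph handed to TensorRT
                min_subgraph_size: 13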
@@ -40,6 +40,37 @@ class DetOp(Op):
"min_size": 3
})
    def set_dynamic_shape_info(self):
        # TensorRT dynamic shape ranges for the det model: the network
        # input "x" plus the intermediate tensors whose shapes vary with it.
        min_input_shape = {
            "x": [1, 3, 50, 50],
            "conv2d_182.tmp_0": [1, 1, 20, 20],
            "nearest_interp_v2_2.tmp_0": [1, 1, 20, 20],
            "nearest_interp_v2_3.tmp_0": [1, 1, 20, 20],
            "nearest_interp_v2_4.tmp_0": [1, 1, 20, 20],
            "nearest_interp_v2_5.tmp_0": [1, 1, 20, 20]
        }
        max_input_shape = {
            "x": [1, 3, 1536, 1536],
            "conv2d_182.tmp_0": [20, 200, 960, 960],
            "nearest_interp_v2_2.tmp_0": [20, 200, 960, 960],
            "nearest_interp_v2_3.tmp_0": [20, 200, 960, 960],
            "nearest_interp_v2_4.tmp_0": [20, 200, 960, 960],
            "nearest_interp_v2_5.tmp_0": [20, 200, 960, 960],
        }
        opt_input_shape = {
            "x": [1, 3, 960, 960],
            "conv2d_182.tmp_0": [3, 96, 240, 240],
            "nearest_interp_v2_2.tmp_0": [3, 96, 240, 240],
            "nearest_interp_v2_3.tmp_0": [3, 24, 240, 240],
            "nearest_interp_v2_4.tmp_0": [3, 24, 240, 240],
            "nearest_interp_v2_5.tmp_0": [3, 24, 240, 240],
        }
        self.dynamic_shape_info = {
            "min_input_shape": min_input_shape,
            "max_input_shape": max_input_shape,
            "opt_input_shape": opt_input_shape,
        }
def preprocess(self, input_dicts, data_id, log_id):
(_, input_dict), = input_dicts.items()
imgs = []
@@ -52,11 +83,11 @@ class DetOp(Op):
det_img = self.det_preprocess(self.im)
_, self.new_h, self.new_w = det_img.shape
imgs.append(det_img[np.newaxis, :].copy())
return {"image": np.concatenate(imgs, axis=0)}, False, None, ""
return {"x": np.concatenate(imgs, axis=0)}, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
# print(fetch_dict)
det_out = fetch_dict["concat_1.tmp_0"]
det_out = fetch_dict["save_infer_model/scale_0.tmp_1"]
ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
]
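
One property of the shape tables defined in set_dynamic_shape_info above is easy to get wrong: TensorRT requires min <= opt <= max for every dimension of every listed tensor, and a violation only surfaces when the engine is built. A small hypothetical checker (not part of this commit) along these lines can validate a dynamic_shape_info dict up front:

    def check_dynamic_shape_info(info):
        # Hypothetical helper: verify min <= opt <= max per dimension
        # for every tensor named in the three shape tables.
        mins = info["min_input_shape"]
        maxs = info["max_input_shape"]
        opts = info["opt_input_shape"]
        assert mins.keys() == maxs.keys() == opts.keys(), "tensor name mismatch"
        for name in mins:
            for lo, best, hi in zip(mins[name], opts[name], maxs[name]):
                if not (lo <= best <= hi):
                    raise ValueError("bad range for {}: min={}, opt={}, max={}"
                                     .format(name, mins[name], opts[name],
                                             maxs[name]))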
@@ -71,6 +102,25 @@ class RecOp(Op):
self.ocr_reader = OCRReader()
self.get_rotate_crop_image = GetRotateCropImage()
self.sorted_boxes = SortedBoxes()
    def set_dynamic_shape_info(self):
        # TensorRT dynamic shape ranges for the rec model: crops are
        # 32 pixels high with widely varying width, hence the wide "x" range.
        min_input_shape = {
            "x": [1, 3, 32, 10],
            "lstm_1.tmp_0": [1, 1, 128]
        }
        max_input_shape = {
            "x": [50, 3, 32, 1000],
            "lstm_1.tmp_0": [500, 50, 128]
        }
        opt_input_shape = {
            "x": [6, 3, 32, 100],
            "lstm_1.tmp_0": [25, 5, 128]
        }
        self.dynamic_shape_info = {
            "min_input_shape": min_input_shape,
            "max_input_shape": max_input_shape,
            "opt_input_shape": opt_input_shape,
        }
def preprocess(self, input_dicts, data_id, log_id):
(_, input_dict), = input_dicts.items()
@@ -143,7 +193,7 @@ class RecOp(Op):
for id, img in enumerate(img_list):
norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
imgs[id] = norm_img
feed = {"image": imgs.copy()}
feed = {"x": imgs.copy()}
feed_list.append(feed)
#_LOGGER.info("feed_list : {}".format(feed_list))
......
@@ -88,7 +88,9 @@ class LocalPredictor(object):
mkldnn_op_list=None,
mkldnn_bf16_op_list=None,
use_feed_fetch_ops=False,
use_ascend_cl=False):
use_ascend_cl=False,
min_subgraph_size=3,
dynamic_shape_info={}):
"""
Load model configs and create the paddle predictor by Paddle Inference API.
@@ -102,6 +104,9 @@ class LocalPredictor(object):
ir_optim: enable computation graph (IR) optimization, False default
use_trt: use nvidia TensorRT optimization, False default
use_lite: use Paddle-Lite Engine, False default
min_subgraph_size: minimum node count for a subgraph to be optimized
    by TensorRT, 3 default
dynamic_shape_info: min/max/opt input shapes for TensorRT dynamic
    shape mode, {} default
use_xpu: run predict on Baidu Kunlun, False default
precision: precision mode, "fp32" default
use_calib: use TensorRT calibration, False default
@@ -211,9 +216,13 @@ class LocalPredictor(object):
precision_mode=precision_type,
workspace_size=1 << 20,
max_batch_size=32,
min_subgraph_size=3,
min_subgraph_size=min_subgraph_size,
use_static=False,
use_calib_mode=False)
if dynamic_shape_info:
    config.set_trt_dynamic_shape_info(
        dynamic_shape_info['min_input_shape'],
        dynamic_shape_info['max_input_shape'],
        dynamic_shape_info['opt_input_shape'])
# set lite
if use_lite:
config.enable_lite_engine(
......
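
Downstream, these two settings map one-to-one onto the Paddle Inference API. A minimal standalone sketch of the same calls, assuming a det model exported to ./ocr_det_model (paths and shape values are illustrative):

    from paddle.inference import Config, PrecisionType, create_predictor

    config = Config("./ocr_det_model/inference.pdmodel",
                    "./ocr_det_model/inference.pdiparams")
    config.enable_use_gpu(100, 0)  # memory pool in MB, GPU id
    config.enable_tensorrt_engine(
        workspace_size=1 << 20,
        max_batch_size=32,
        min_subgraph_size=13,  # subgraphs smaller than this stay on Paddle
        precision_mode=PrecisionType.Float32,
        use_static=False,
        use_calib_mode=False)
    # min <= opt <= max must hold per dimension for every tensor listed.
    config.set_trt_dynamic_shape_info(
        {"x": [1, 3, 50, 50]},      # min_input_shape
        {"x": [1, 3, 1536, 1536]},  # max_input_shape
        {"x": [1, 3, 960, 960]})    # opt_input_shape
    predictor = create_predictor(config)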
@@ -50,7 +50,9 @@ class LocalServiceHandler(object):
use_mkldnn=False,
mkldnn_cache_capacity=0,
mkldnn_op_list=None,
mkldnn_bf16_op_list=None):
mkldnn_bf16_op_list=None,
min_subgraph_size=3,
dynamic_shape_info={}):
"""
Initialization of LocalServiceHandler
@@ -92,6 +94,8 @@ class LocalServiceHandler(object):
self._mkldnn_cache_capacity = 0
self._mkldnn_op_list = None
self._mkldnn_bf16_op_list = None
self.min_subgraph_size = 3
self.dynamic_shape_info = {}
if device_type == -1:
# device_type is not set, determined by `devices`,
@@ -120,6 +124,8 @@ class LocalServiceHandler(object):
self._use_gpu = True
devices = [int(x) for x in devices.split(",")]
self._use_trt = True
self.min_subgraph_size = min_subgraph_size
self.dynamic_shape_info = dynamic_shape_info
elif device_type == 3:
# ARM CPU
self._device_name = "arm"
@@ -176,14 +182,14 @@ class LocalServiceHandler(object):
"mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
"client_type:{}, fetch_names:{}, precision:{}, use_mkldnn:{}, "
"mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
"mkldnn_bf16_op_list:{}, use_ascend_cl:{}".format(
"mkldnn_bf16_op_list:{}, use_ascend_cl:{}, min_subgraph_size:{}".format(
model_config, self._device_name, self._use_gpu, self._use_trt,
self._use_lite, self._use_xpu, device_type, self._devices,
self._mem_optim, self._ir_optim, self._use_profile,
self._thread_num, self._client_type, self._fetch_names,
self._precision, self._use_mkldnn, self._mkldnn_cache_capacity,
self._mkldnn_op_list, self._mkldnn_bf16_op_list,
self._use_ascend_cl))
self._use_ascend_cl, self.min_subgraph_size))
def get_fetch_list(self):
return self._fetch_names
@@ -240,7 +246,9 @@ class LocalServiceHandler(object):
mkldnn_cache_capacity=self._mkldnn_cache_capacity,
mkldnn_op_list=self._mkldnn_op_list,
mkldnn_bf16_op_list=self._mkldnn_bf16_op_list,
use_ascend_cl=self._use_ascend_cl)
use_ascend_cl=self._use_ascend_cl,
min_subgraph_size=self.min_subgraph_size,
dynamic_shape_info=self.dynamic_shape_info)
return self._local_predictor_client
def get_client_config(self):
......
@@ -116,6 +116,11 @@ class Op(object):
self._for_close_op_lock = threading.Lock()
self._succ_init_op = False
self._succ_close_op = False
        self.dynamic_shape_info = {}
        self.set_dynamic_shape_info()

    def set_dynamic_shape_info(self):
        # Hook for subclasses: fill self.dynamic_shape_info with
        # min/max/opt input shapes to enable TensorRT dynamic shape.
        pass
# for feed/fetch dict check
@staticmethod
@@ -182,6 +187,7 @@ class Op(object):
self.mkldnn_cache_capacity = 0
self.mkldnn_op_list = None
self.mkldnn_bf16_op_list = None
self.min_subgraph_size = 3
if self._server_endpoints is None:
server_endpoints = conf.get("server_endpoints", [])
@@ -212,6 +218,8 @@ class Op(object):
"mkldnn_op_list")
self.mkldnn_bf16_op_list = local_service_conf.get(
"mkldnn_bf16_op_list")
self.min_subgraph_size = local_service_conf.get(
    "min_subgraph_size", 3)
if self.model_config is None:
self.with_serving = False
@@ -233,7 +241,9 @@ class Op(object):
mkldnn_cache_capacity=self.mkldnn_cache_capacity,
mkldnn_op_list=self.mkldnn_op_list,
mkldnn_bf16_op_list=self.mkldnn_bf16_op_list)
mkldnn_bf16_op_list=self.mkldnn_bf16_op_list,
min_subgraph_size=self.min_subgraph_size,
dynamic_shape_info=self.dynamic_shape_info)
service_handler.prepare_server() # get fetch_list
serivce_ports = service_handler.get_port_list()
self._server_endpoints = [
@@ -261,7 +271,9 @@ class Op(object):
mkldnn_cache_capacity=self.mkldnn_cache_capacity,
mkldnn_op_list=self.mkldnn_op_list,
mkldnn_bf16_op_list=self.mkldnn_bf16_op_list)
mkldnn_bf16_op_list=self.mkldnn_bf16_op_list,
min_subgraph_size=self.min_subgraph_size,
dynamic_shape_info=self.dynamic_shape_info)
if self._client_config is None:
self._client_config = service_handler.get_client_config(
)
@@ -766,7 +778,9 @@ class Op(object):
self.ir_optim, self.precision, self.use_mkldnn,
self.mkldnn_cache_capacity, self.mkldnn_op_list,
self.mkldnn_bf16_op_list, self.is_jump_op(),
self.get_output_channels_of_jump_ops()))
self.get_output_channels_of_jump_ops(),
self.min_subgraph_size,
self.dynamic_shape_info))
p.daemon = True
p.start()
process.append(p)
@@ -803,7 +817,9 @@ class Op(object):
self.ir_optim, self.precision, self.use_mkldnn,
self.mkldnn_cache_capacity, self.mkldnn_op_list,
self.mkldnn_bf16_op_list, self.is_jump_op(),
self.get_output_channels_of_jump_ops()))
self.get_output_channels_of_jump_ops(),
self.min_subgraph_size,
self.dynamic_shape_info))
# When a process exits, it attempts to terminate
# all of its daemonic child processes.
t.daemon = True
@@ -1264,7 +1280,7 @@ class Op(object):
is_thread_op, trace_buffer, model_config, workdir, thread_num,
device_type, devices, mem_optim, ir_optim, precision, use_mkldnn,
mkldnn_cache_capacity, mkldnn_op_list, mkldnn_bf16_op_list,
is_jump_op, output_channels_of_jump_ops):
is_jump_op, output_channels_of_jump_ops, min_subgraph_size, dynamic_shape_info):
"""
_run() is the entry function of OP process / thread model. When client
type is local_predictor in process mode, the CUDA environment needs to
@@ -1316,7 +1332,9 @@ class Op(object):
use_mkldnn=use_mkldnn,
mkldnn_cache_capacity=mkldnn_cache_capacity,
mkldnn_op_list=mkldnn_op_list,
mkldnn_bf16_op_list=mkldnn_bf16_op_list)
mkldnn_bf16_op_list=mkldnn_bf16_op_list,
min_subgraph_size=min_subgraph_size,
dynamic_shape_info=dynamic_shape_info)
_LOGGER.info("Init cuda env in process {}".format(
concurrency_idx))
......
@@ -260,6 +260,7 @@ class PipelineServer(object):
"use_calib": False,
"use_mkldnn": False,
"mkldnn_cache_capacity": 0,
"min_subgraph_size": 3,
},
}
for op in self._used_op:
......
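
From the web service side, all an Op subclass has to do is override the new hook; LocalServiceHandler and LocalPredictor then thread dynamic_shape_info and min_subgraph_size through to the Paddle Inference calls above. A minimal sketch, assuming the paddle_serving_server pipeline Op class and an input tensor named x (shape values illustrative):

    from paddle_serving_server.web_service import Op

    class MyDetOp(Op):
        def set_dynamic_shape_info(self):
            # Called from Op.__init__; one entry per tensor whose
            # shape varies at runtime, with min <= opt <= max.
            self.dynamic_shape_info = {
                "min_input_shape": {"x": [1, 3, 50, 50]},
                "max_input_shape": {"x": [1, 3, 1536, 1536]},
                "opt_input_shape": {"x": [1, 3, 960, 960]},
            }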