Commit d60b154a authored by TeslaZhao

Modify PIPELINE DOCs & Pipeline Serving supports low precision inference

Parent f92e6a52

This diff is collapsed.
This diff is collapsed.
@@ -31,3 +31,11 @@ op:
     #Fetch result list, keyed by the alias_name of fetch_var in client_config
     fetch_list: ["score"]
+
+    #precision: inference precision; lowering it can speed up inference
+    #GPU supports: "fp32"(default), "fp16", "int8"
+    #CPU supports: "fp32"(default), "fp16", "bf16"(mkldnn); "int8" is not supported
+    precision: "fp16"
+
+    #ir_optim switch
+    ir_optim: False
@@ -30,4 +30,12 @@ op:
     client_type: local_predictor
     #Fetch result list, keyed by the alias_name of fetch_var in client_config
     fetch_list: ["price"]
+
+    #precision: inference precision; lowering it can speed up prediction
+    #GPU supports: "fp32"(default), "fp16", "int8"
+    #CPU supports: "fp32"(default), "fp16", "bf16"(mkldnn); "int8" is not supported
+    precision: "FP16"
+
+    #ir_optim switch
+    ir_optim: False
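Note that the predictor lowercases the value before the lookup shown in the next hunk, so "FP16" above behaves the same as "fp16". Outside the pipeline config, the same knob can be exercised through LocalPredictor directly; a minimal sketch, assuming the import path and the keyword arguments that the handler hunks below pass through (the model directory is a placeholder):

    from paddle_serving_app.local_predict import LocalPredictor

    predictor = LocalPredictor()
    predictor.load_model_config(
        "uci_housing_model",  # placeholder model directory
        use_gpu=True,
        precision="fp16")     # lowered internally, so "FP16" works too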
@@ -119,8 +119,11 @@ class LocalPredictor(object):
             self.fetch_names_to_type_[var.alias_name] = var.fetch_type

         precision_type = paddle_infer.PrecisionType.Float32
-        if precision.lower() in precision_map:
+        if precision is not None and precision.lower() in precision_map:
             precision_type = precision_map[precision.lower()]
+        else:
+            logger.warning("precision error!!! Please check precision:{}".
+                           format(precision))
         if use_profile:
             config.enable_profile()
         if mem_optim:
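The branch above relies on a precision_map that this hunk does not show; presumably it maps the lowercase config strings onto paddle_infer precision enums, roughly (an assumed sketch, not part of this diff):

    precision_map = {
        "int8": paddle_infer.PrecisionType.Int8,
        "fp32": paddle_infer.PrecisionType.Float32,
        "fp16": paddle_infer.PrecisionType.Half,
    }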
@@ -156,8 +159,11 @@ class LocalPredictor(object):
         if not use_gpu and not use_lite:
             if precision_type == paddle_infer.PrecisionType.Int8:
-                config.enable_quantizer()
-            if precision.lower() == "bf16":
+                logger.warning(
+                    "PRECISION INT8 is not supported in CPU right now! "
+                    "Please use fp16 or bf16.")
+                #config.enable_quantizer()
+            if precision is not None and precision.lower() == "bf16":
                 config.enable_mkldnn_bfloat16()
         self.predictor = paddle_infer.create_predictor(config)
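On the CPU path, bf16 rides on MKL-DNN (oneDNN). A standalone paddle.inference sketch of what the branch above enables, with a placeholder model directory and assuming a CPU with bf16 support:

    import paddle.inference as paddle_infer

    config = paddle_infer.Config("serving_server_model")  # placeholder path
    config.enable_mkldnn()            # bf16 is an MKLDNN feature
    config.enable_mkldnn_bfloat16()   # effective on CPUs with AVX512/bf16
    predictor = paddle_infer.create_predictor(config)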
@@ -44,7 +44,8 @@ class LocalServiceHandler(object):
                  mem_optim=True,
                  ir_optim=False,
                  available_port_generator=None,
-                 use_profile=False):
+                 use_profile=False,
+                 precision="fp32"):
         """
         Initialization of LocalServiceHandler

@@ -62,6 +63,7 @@ class LocalServiceHandler(object):
             ir_optim: use calculation chart optimization, False default.
             available_port_generator: generate available ports
             use_profile: use profiling, False default.
+            precision: inference precision, e.g. "fp32", "fp16", "int8"

         Returns:
             None
@@ -137,16 +139,17 @@ class LocalServiceHandler(object):
         self._server_pros = []
         self._use_profile = use_profile
         self._fetch_names = fetch_names
+        self._precision = precision

         _LOGGER.info(
             "Models({}) will be launched by device {}. use_gpu:{}, "
             "use_trt:{}, use_lite:{}, use_xpu:{}, device_type:{}, devices:{}, "
             "mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
-            "client_type:{}, fetch_names:{}".format(
+            "client_type:{}, fetch_names:{}, precision:{}".format(
                 model_config, self._device_name, self._use_gpu, self._use_trt,
-                self._use_lite, self._use_xpu, device_type, self._devices,
-                self._mem_optim, self._ir_optim, self._use_profile,
-                self._thread_num, self._client_type, self._fetch_names))
+                self._use_lite, self._use_xpu, device_type, self._devices,
+                self._mem_optim, self._ir_optim, self._use_profile,
+                self._thread_num, self._client_type, self._fetch_names,
+                self._precision))

     def get_fetch_list(self):
         return self._fetch_names
@@ -197,14 +200,15 @@ class LocalServiceHandler(object):
                 ir_optim=self._ir_optim,
                 use_trt=self._use_trt,
                 use_lite=self._use_lite,
-                use_xpu=self._use_xpu)
+                use_xpu=self._use_xpu,
+                precision=self._precision)
         return self._local_predictor_client

     def get_client_config(self):
         return os.path.join(self._model_config, "serving_server_conf.prototxt")

     def _prepare_one_server(self, workdir, port, gpuid, thread_num, mem_optim,
-                            ir_optim):
+                            ir_optim, precision):
         """
         According to self._device_name, generating one Cpu/Gpu/Arm Server, and
         setting the model config and startup params.
@@ -216,6 +220,7 @@ class LocalServiceHandler(object):
             thread_num: thread num
             mem_optim: use memory/graphics memory optimization
             ir_optim: use calculation chart optimization
+            precision: inference precision, e.g. "fp32", "fp16", "int8"

         Returns:
             server: CpuServer/GpuServer
@@ -256,6 +261,7 @@ class LocalServiceHandler(object):
         server.set_num_threads(thread_num)
         server.set_memory_optimize(mem_optim)
         server.set_ir_optimize(ir_optim)
+        server.set_precision(precision)
         server.load_model_config(self._model_config)
         server.prepare_server(
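set_precision() therefore reaches the plain RPC server path as well. A hypothetical standalone setup mirroring the call order in this hunk; only the set_*/load/prepare calls appear in the diff, while the imports and the op sequence are assumptions:

    from paddle_serving_server import OpMaker, OpSeqMaker, Server  # imports assumed

    op_maker = OpMaker()
    op_seq_maker = OpSeqMaker()
    op_seq_maker.add_op(op_maker.create('general_reader'))
    op_seq_maker.add_op(op_maker.create('general_infer'))
    op_seq_maker.add_op(op_maker.create('general_response'))

    server = Server()
    server.set_op_sequence(op_seq_maker.get_op_sequence())
    server.set_num_threads(4)
    server.set_memory_optimize(True)
    server.set_ir_optimize(False)
    server.set_precision("fp16")                      # the new knob
    server.load_model_config("serving_server_model")  # placeholder path
    server.prepare_server(workdir="workdir", port=9393, device="gpu")
    server.run_server()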
@@ -292,7 +298,8 @@ class LocalServiceHandler(object):
                     device_id,
                     thread_num=self._thread_num,
                     mem_optim=self._mem_optim,
-                    ir_optim=self._ir_optim))
+                    ir_optim=self._ir_optim,
+                    precision=self._precision))

     def start_server(self):
         """
@@ -138,6 +138,7 @@ class Op(object):
             self.devices = ""
             self.mem_optim = False
             self.ir_optim = False
+            self.precision = "fp32"
             if self._server_endpoints is None:
                 server_endpoints = conf.get("server_endpoints", [])
                 if len(server_endpoints) != 0:
@@ -159,6 +160,7 @@ class Op(object):
                 self.mem_optim = local_service_conf.get("mem_optim")
                 self.ir_optim = local_service_conf.get("ir_optim")
                 self._fetch_names = local_service_conf.get("fetch_list")
+                self.precision = local_service_conf.get("precision")
                 if self.model_config is None:
                     self.with_serving = False
                 else:
@@ -173,7 +175,8 @@ class Op(object):
                         device_type=self.device_type,
                         devices=self.devices,
                         mem_optim=self.mem_optim,
-                        ir_optim=self.ir_optim)
+                        ir_optim=self.ir_optim,
+                        precision=self.precision)
                     service_handler.prepare_server()  # get fetch_list
                     serivce_ports = service_handler.get_port_list()
                     self._server_endpoints = [
@@ -195,7 +198,8 @@ class Op(object):
                         devices=self.devices,
                         fetch_names=self._fetch_names,
                         mem_optim=self.mem_optim,
-                        ir_optim=self.ir_optim)
+                        ir_optim=self.ir_optim,
+                        precision=self.precision)
                     if self._client_config is None:
                         self._client_config = service_handler.get_client_config(
                         )
@@ -560,7 +564,7 @@ class Op(object):
                     self._get_output_channels(), False, trace_buffer,
                     self.model_config, self.workdir, self.thread_num,
                     self.device_type, self.devices, self.mem_optim,
-                    self.ir_optim))
+                    self.ir_optim, self.precision))
             p.daemon = True
             p.start()
             process.append(p)
@@ -594,7 +598,7 @@ class Op(object):
                     self._get_output_channels(), True, trace_buffer,
                     self.model_config, self.workdir, self.thread_num,
                     self.device_type, self.devices, self.mem_optim,
-                    self.ir_optim))
+                    self.ir_optim, self.precision))
             # When a process exits, it attempts to terminate
             # all of its daemonic child processes.
             t.daemon = True
@@ -1064,7 +1068,7 @@ class Op(object):
     def _run(self, concurrency_idx, input_channel, output_channels,
              is_thread_op, trace_buffer, model_config, workdir, thread_num,
-             device_type, devices, mem_optim, ir_optim):
+             device_type, devices, mem_optim, ir_optim, precision):
         """
         _run() is the entry function of OP process / thread model. When client
         type is local_predictor in process mode, the CUDA environment needs to
@@ -1085,7 +1089,8 @@ class Op(object):
             device_type: support multiple devices
             devices: gpu id list[gpu], "" default[cpu]
             mem_optim: use memory/graphics memory optimization, True default.
             ir_optim: use calculation chart optimization, False default.
+            precision: inference precision, e.g. "fp32", "fp16", "int8"

         Returns:
             None
@@ -1104,7 +1109,8 @@ class Op(object):
                 device_type=device_type,
                 devices=devices,
                 mem_optim=mem_optim,
-                ir_optim=ir_optim)
+                ir_optim=ir_optim,
+                precision=precision)
             _LOGGER.info("Init cuda env in process {}".format(
                 concurrency_idx))
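End to end, the pipeline Op reads precision from the yaml and forwards it into LocalServiceHandler. A hypothetical direct construction using only the argument names visible in these hunks; the model directory and the device_type encoding are placeholders/assumptions:

    handler = LocalServiceHandler(
        model_config="uci_housing_model",  # placeholder model directory
        device_type=1,                     # assumed: GPU in this encoding
        devices="0",
        mem_optim=True,
        ir_optim=False,
        precision="fp16")
    handler.prepare_server()  # also resolves fetch_list, per the Op hunk
    handler.start_server()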