提交 d60b154a 编写于 作者: T TeslaZhao

Modify PIPELINE DOCs & Pipeline Serving supports low precision inference

上级 f92e6a52
此差异已折叠。
此差异已折叠。
......@@ -31,3 +31,11 @@ op:
#Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["score"]
#precision, 预测精度,降低预测精度可提升推理速度
#GPU 支持: "fp32"(default), "fp16", "int8";
#CPU 支持: "fp32"(default), "fp16", "bf16"(mkldnn); 不支持: "int8"
precision: "fp16"
#ir_optim开关
ir_optim: False
......@@ -31,3 +31,11 @@ op:
#Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["price"]
#precision, 预测精度,降低预测精度可提升预测速度
#GPU 支持: "fp32"(default), "fp16", "int8";
#CPU 支持: "fp32"(default), "fp16", "bf16"(mkldnn); 不支持: "int8"
precision: "FP16"
#ir_optim开关
ir_optim: False
......@@ -119,8 +119,11 @@ class LocalPredictor(object):
self.fetch_names_to_type_[var.alias_name] = var.fetch_type
precision_type = paddle_infer.PrecisionType.Float32
if precision.lower() in precision_map:
if precision is not None and precision.lower() in precision_map:
precision_type = precision_map[precision.lower()]
else:
logger.warning("precision error!!! Please check precision:{}".
format(precision))
if use_profile:
config.enable_profile()
if mem_optim:
......@@ -156,8 +159,11 @@ class LocalPredictor(object):
if not use_gpu and not use_lite:
if precision_type == paddle_infer.PrecisionType.Int8:
config.enable_quantizer()
if precision.lower() == "bf16":
logger.warning(
"PRECISION INT8 is not supported in CPU right now! Please use fp16 or bf16."
)
#config.enable_quantizer()
if precision is not None and precision.lower() == "bf16":
config.enable_mkldnn_bfloat16()
self.predictor = paddle_infer.create_predictor(config)
......
......@@ -44,7 +44,8 @@ class LocalServiceHandler(object):
mem_optim=True,
ir_optim=False,
available_port_generator=None,
use_profile=False):
use_profile=False,
precision="fp32"):
"""
Initialization of localservicehandler
......@@ -62,6 +63,7 @@ class LocalServiceHandler(object):
ir_optim: use calculation chart optimization, False default.
available_port_generator: generate available ports
use_profile: use profiling, False default.
precision: inference precision, e.g. "fp32", "fp16", "int8"
Returns:
None
......@@ -137,16 +139,17 @@ class LocalServiceHandler(object):
self._server_pros = []
self._use_profile = use_profile
self._fetch_names = fetch_names
self._precision = precision
_LOGGER.info(
"Models({}) will be launched by device {}. use_gpu:{}, "
"use_trt:{}, use_lite:{}, use_xpu:{}, device_type:{}, devices:{}, "
"mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
"client_type:{}, fetch_names:{}".format(
"client_type:{}, fetch_names:{} precision:{}".format(
model_config, self._device_name, self._use_gpu, self._use_trt,
self._use_lite, self._use_xpu, device_type, self._devices,
self._mem_optim, self._ir_optim, self._use_profile,
self._thread_num, self._client_type, self._fetch_names))
self._use_lite, self._use_xpu, device_type, self._devices, self.
_mem_optim, self._ir_optim, self._use_profile, self._thread_num,
self._client_type, self._fetch_names, self._precision))
def get_fetch_list(self):
return self._fetch_names
......@@ -197,14 +200,15 @@ class LocalServiceHandler(object):
ir_optim=self._ir_optim,
use_trt=self._use_trt,
use_lite=self._use_lite,
use_xpu=self._use_xpu)
use_xpu=self._use_xpu,
precision=self._precision)
return self._local_predictor_client
def get_client_config(self):
return os.path.join(self._model_config, "serving_server_conf.prototxt")
def _prepare_one_server(self, workdir, port, gpuid, thread_num, mem_optim,
ir_optim):
ir_optim, precision):
"""
According to self._device_name, generating one Cpu/Gpu/Arm Server, and
setting the model config and startup params.
......@@ -216,6 +220,7 @@ class LocalServiceHandler(object):
thread_num: thread num
mem_optim: use memory/graphics memory optimization
ir_optim: use calculation chart optimization
precision: inference precision, e.g. "fp32", "fp16", "int8"
Returns:
server: CpuServer/GpuServer
......@@ -256,6 +261,7 @@ class LocalServiceHandler(object):
server.set_num_threads(thread_num)
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
server.set_precision(precision)
server.load_model_config(self._model_config)
server.prepare_server(
......@@ -292,7 +298,8 @@ class LocalServiceHandler(object):
device_id,
thread_num=self._thread_num,
mem_optim=self._mem_optim,
ir_optim=self._ir_optim))
ir_optim=self._ir_optim,
precision=self._precision))
def start_server(self):
"""
......
......@@ -138,6 +138,7 @@ class Op(object):
self.devices = ""
self.mem_optim = False
self.ir_optim = False
self.precision = "fp32"
if self._server_endpoints is None:
server_endpoints = conf.get("server_endpoints", [])
if len(server_endpoints) != 0:
......@@ -159,6 +160,7 @@ class Op(object):
self.mem_optim = local_service_conf.get("mem_optim")
self.ir_optim = local_service_conf.get("ir_optim")
self._fetch_names = local_service_conf.get("fetch_list")
self.precision = local_service_conf.get("precision")
if self.model_config is None:
self.with_serving = False
else:
......@@ -173,7 +175,8 @@ class Op(object):
device_type=self.device_type,
devices=self.devices,
mem_optim=self.mem_optim,
ir_optim=self.ir_optim)
ir_optim=self.ir_optim,
precision=self.precision)
service_handler.prepare_server() # get fetch_list
serivce_ports = service_handler.get_port_list()
self._server_endpoints = [
......@@ -195,7 +198,8 @@ class Op(object):
devices=self.devices,
fetch_names=self._fetch_names,
mem_optim=self.mem_optim,
ir_optim=self.ir_optim)
ir_optim=self.ir_optim,
precision=self.precision)
if self._client_config is None:
self._client_config = service_handler.get_client_config(
)
......@@ -560,7 +564,7 @@ class Op(object):
self._get_output_channels(), False, trace_buffer,
self.model_config, self.workdir, self.thread_num,
self.device_type, self.devices, self.mem_optim,
self.ir_optim))
self.ir_optim, self.precision))
p.daemon = True
p.start()
process.append(p)
......@@ -594,7 +598,7 @@ class Op(object):
self._get_output_channels(), True, trace_buffer,
self.model_config, self.workdir, self.thread_num,
self.device_type, self.devices, self.mem_optim,
self.ir_optim))
self.ir_optim, self.precision))
# When a process exits, it attempts to terminate
# all of its daemonic child processes.
t.daemon = True
......@@ -1064,7 +1068,7 @@ class Op(object):
def _run(self, concurrency_idx, input_channel, output_channels,
is_thread_op, trace_buffer, model_config, workdir, thread_num,
device_type, devices, mem_optim, ir_optim):
device_type, devices, mem_optim, ir_optim, precision):
"""
_run() is the entry function of OP process / thread model. When client
type is local_predictor in process mode, the CUDA environment needs to
......@@ -1086,6 +1090,7 @@ class Op(object):
devices: gpu id list[gpu], "" default[cpu]
mem_optim: use memory/graphics memory optimization, True default.
ir_optim: use calculation chart optimization, False default.
precision: inference precision, e.g. "fp32", "fp16", "int8"
Returns:
None
......@@ -1104,7 +1109,8 @@ class Op(object):
device_type=device_type,
devices=devices,
mem_optim=mem_optim,
ir_optim=ir_optim)
ir_optim=ir_optim,
precision=precision)
_LOGGER.info("Init cuda env in process {}".format(
concurrency_idx))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册