Commit 241e53da authored by HexToString

merge from origin

Parent 0a8e26d4
@@ -13,8 +13,10 @@
// limitations under the License.
#pragma once
#include <algorithm>
#include <cctype>
#include <fstream>
#include <string>
#include "core/predictor/common/inner_common.h"
#include "core/predictor/common/macros.h"
@@ -26,6 +28,38 @@ namespace predictor {
namespace butil = base;
#endif

enum class Precision {
  kUnk = -1,     // unknown type
  kFloat32 = 0,  // fp32
  kInt8,         // int8
  kHalf,         // fp16
  kBfloat16,     // bf16
};
static std::string PrecisionTypeString(const Precision data_type) {
  switch (data_type) {
    case Precision::kFloat32:
      return "kFloat32";
    case Precision::kInt8:
      return "kInt8";
    case Precision::kHalf:
      return "kHalf";
    case Precision::kBfloat16:
      return "kBfloat16";
    default:
      return "kUnk";
  }
}
static std::string ToLower(const std::string& data) {
  std::string result = data;
  std::transform(
      result.begin(), result.end(), result.begin(), [](unsigned char c) {
        return std::tolower(c);
      });
  return result;
}
class TimerFlow {
 public:
  static const int MAX_SIZE = 1024;
......
@@ -27,6 +27,12 @@ logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("LocalPredictor")
logger.setLevel(logging.INFO)
precision_map = {
    'int8': paddle_infer.PrecisionType.Int8,
    'fp32': paddle_infer.PrecisionType.Float32,
    'fp16': paddle_infer.PrecisionType.Half,
}
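The map above is what load_model_config uses below to turn a user-facing precision string into a Paddle Inference enum; "bf16" is deliberately absent and handled by a separate string check further down, since PrecisionType has no bfloat16 member. A minimal sketch of the lookup with its fp32 fallback (resolve_precision is a hypothetical helper, not part of the patch):

    def resolve_precision(precision):
        # unknown strings fall back to fp32, mirroring the check in load_model_config
        return precision_map.get(precision.lower(),
                                 paddle_infer.PrecisionType.Float32)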
class LocalPredictor(object):
"""
@@ -56,6 +62,8 @@ class LocalPredictor(object):
use_trt=False,
use_lite=False,
use_xpu=False,
precision="fp32",
use_calib=False,
use_feed_fetch_ops=False):
"""
Load model configs and create the paddle predictor by Paddle Inference API.
@@ -71,6 +79,8 @@ class LocalPredictor(object):
use_trt: use NVIDIA TensorRT optimization, False default
use_lite: use Paddle-Lite engine, False default
use_xpu: run predict on Baidu Kunlun, False default
precision: precision mode, "fp32" default
use_calib: use TensorRT calibration, False default
use_feed_fetch_ops: use feed/fetch ops, False default.
"""
client_config = "{}/serving_server_conf.prototxt".format(model_path)
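For context, a hedged usage sketch of the new arguments; the import path is assumed, and the model directory and flag values are illustrative, not from the patch:

    from paddle_serving_app.local_predict import LocalPredictor

    predictor = LocalPredictor()
    predictor.load_model_config(
        "uci_housing_model",  # hypothetical model directory
        use_gpu=True,
        gpu_id=0,
        use_trt=True,
        precision="fp16",     # resolved through precision_map above
        use_calib=False)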
@@ -88,9 +98,11 @@ class LocalPredictor(object):
logger.info(
    "LocalPredictor load_model_config params: model_path:{}, use_gpu:{},\
    gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
    use_trt:{}, use_lite:{}, use_xpu: {}, precision: {}, use_calib: {},\
    use_feed_fetch_ops:{}"
    .format(model_path, use_gpu, gpu_id, use_profile, thread_num,
            mem_optim, ir_optim, use_trt, use_lite, use_xpu, precision,
            use_calib, use_feed_fetch_ops))
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
@@ -106,6 +118,9 @@ class LocalPredictor(object):
self.fetch_names_to_idx_[var.alias_name] = i
self.fetch_names_to_type_[var.alias_name] = var.fetch_type

precision_type = paddle_infer.PrecisionType.Float32
if precision.lower() in precision_map:
    precision_type = precision_map[precision.lower()]
if use_profile:
    config.enable_profile()
if mem_optim:
@@ -121,6 +136,7 @@ class LocalPredictor(object):
config.enable_use_gpu(100, gpu_id)
if use_trt:
    config.enable_tensorrt_engine(
        precision_mode=precision_type,
        workspace_size=1 << 20,
        max_batch_size=32,
        min_subgraph_size=3,
@@ -129,7 +145,7 @@
if use_lite:
    config.enable_lite_engine(
        precision_mode=precision_type,
        zero_copy=True,
        passes_filter=[],
        ops_filter=[])
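The enable_tensorrt_engine call above is cut off by the viewer; a sketch of the plausible full call once precision_mode is threaded through (use_static and use_calib_mode are assumptions filling the elided tail, only the arguments visible in the hunk are confirmed):

    config.enable_tensorrt_engine(
        precision_mode=precision_type,
        workspace_size=1 << 20,    # 1 MB scratch space for TensorRT
        max_batch_size=32,
        min_subgraph_size=3,
        use_static=False,          # assumed; elided in the diff
        use_calib_mode=use_calib)  # assumed; would consume the new use_calib flag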
@@ -138,6 +154,11 @@
    # 8MB L3 cache
    config.enable_xpu(8 * 1024 * 1024)
if not use_gpu and not use_lite:
    if precision_type == paddle_infer.PrecisionType.Int8:
        config.enable_quantizer()
    if precision.lower() == "bf16":
        config.enable_mkldnn_bfloat16()
self.predictor = paddle_infer.create_predictor(config)
def predict(self, feed=None, fetch=None, batch=False, log_id=0):
......
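On the CPU-only branch above, "int8" switches on the MKL-DNN post-training quantizer and "bf16" the bfloat16 kernels. A minimal standalone sketch of the same branching against a bare config (model directory and precision value hypothetical):

    import paddle.inference as paddle_infer

    config = paddle_infer.Config("resnet50_serving_model")  # hypothetical
    precision = "bf16"
    if precision == "int8":
        config.enable_quantizer()        # same call the patch guards with Int8
    elif precision == "bf16":
        config.enable_mkldnn_bfloat16()  # oneDNN bfloat16 kernels
    predictor = paddle_infer.create_predictor(config)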
@@ -71,6 +71,8 @@ class Server(object):
self.max_concurrency = 0
self.num_threads = 2
self.port = 8080
self.precision = "fp32"
self.use_calib = False
self.reload_interval_s = 10
self.max_body_size = 64 * 1024 * 1024
self.module_path = os.path.dirname(paddle_serving_server.__file__)
@@ -113,6 +115,12 @@ class Server(object):
def set_port(self, port):
    self.port = port

def set_precision(self, precision="fp32"):
    self.precision = precision

def set_use_calib(self, use_calib=False):
    self.use_calib = use_calib

def set_reload_interval(self, interval):
    self.reload_interval_s = interval
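A short usage sketch of the new setters alongside an existing one; the values are illustrative, and the strings end up on the serving binary's command line as -precision and -use_calib (see the command templates further down):

    server = Server()
    server.set_port(9393)
    server.set_precision("int8")  # forwarded as -precision int8
    server.set_use_calib(True)    # forwarded as -use_calib True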
@@ -186,6 +194,10 @@ class Server(object):
engine.use_trt = self.use_trt
engine.use_lite = self.use_lite
engine.use_xpu = self.use_xpu
engine.use_gpu = False
if self.device == "gpu":
    engine.use_gpu = True
if os.path.exists('{}/__params__'.format(model_config_path)):
    engine.combined_model = True
else:
@@ -472,6 +484,8 @@ class Server(object):
"-max_concurrency {} " \
"-num_threads {} " \
"-port {} " \
"-precision {} " \
"-use_calib {} " \
"-reload_interval_s {} " \
"-resource_path {} " \
"-resource_file {} " \
@@ -485,6 +499,8 @@ class Server(object):
self.max_concurrency,
self.num_threads,
self.port,
self.precision,
self.use_calib,
self.reload_interval_s,
self.workdir,
self.resource_fn,
@@ -500,6 +516,8 @@ class Server(object):
"-max_concurrency {} " \
"-num_threads {} " \
"-port {} " \
"-precision {} " \
"-use_calib {} " \
"-reload_interval_s {} " \
"-resource_path {} " \
"-resource_file {} " \
@@ -514,6 +532,8 @@ class Server(object):
self.max_concurrency,
self.num_threads,
self.port,
self.precision,
self.use_calib,
self.reload_interval_s,
self.workdir,
self.resource_fn,
@@ -562,6 +582,12 @@ class MultiLangServer(object):
def set_port(self, port):
    self.gport_ = port

def set_precision(self, precision="fp32"):
    self.precision = precision

def set_use_calib(self, use_calib=False):
    self.use_calib = use_calib

def set_reload_interval(self, interval):
    self.bserver_.set_reload_interval(interval)
......
@@ -238,6 +238,8 @@ class PipelineServer(object):
        "devices": "",
        "mem_optim": True,
        "ir_optim": False,
        "precision": "fp32",
        "use_calib": False,
    },
}
for op in self._used_op:
@@ -394,6 +396,8 @@ class ServerYamlConfChecker(object):
    "devices": "",
    "mem_optim": True,
    "ir_optim": False,
    "precision": "fp32",
    "use_calib": False,
}
conf_type = {
    "model_config": str,
@@ -403,6 +407,8 @@ class ServerYamlConfChecker(object):
    "devices": str,
    "mem_optim": bool,
    "ir_optim": bool,
    "precision": str,
    "use_calib": bool,
}
conf_qualification = {"thread_num": (">=", 1), }
ServerYamlConfChecker.check_conf(conf, default_conf, conf_type,
......
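For reference, an op configuration fragment that now passes the checker with the two new keys; the model path and device string are illustrative, not from the patch:

    conf = {
        "model_config": "ocr_det_model",  # hypothetical model directory
        "devices": "0",
        "mem_optim": True,
        "ir_optim": False,
        "precision": "fp16",
        "use_calib": False,
    }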