Commit b306084f authored by HexToString

Merge branch 'develop' of https://github.com/PaddlePaddle/Serving into merge_branch

@@ -37,9 +37,24 @@ using paddle_infer::Tensor;
 using paddle_infer::CreatePredictor;

 DECLARE_int32(gpuid);
+DECLARE_string(precision);
+DECLARE_bool(use_calib);

 static const int max_batch = 32;
 static const int min_subgraph_size = 3;
+static PrecisionType precision_type;
+
+PrecisionType GetPrecision(const std::string& precision_data) {
+  std::string precision_type = predictor::ToLower(precision_data);
+  if (precision_type == "fp32") {
+    return PrecisionType::kFloat32;
+  } else if (precision_type == "int8") {
+    return PrecisionType::kInt8;
+  } else if (precision_type == "fp16") {
+    return PrecisionType::kHalf;
+  }
+  return PrecisionType::kFloat32;
+}

 // Engine Base
 class EngineCore {
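For reference, the mapping above is case-insensitive (via predictor::ToLower) and silently falls back to fp32 for any unrecognized string. A minimal Python sketch of the same lookup, for illustration only (not code from this commit):

# Illustration only: the fp32/int8/fp16 -> PrecisionType mapping that
# GetPrecision() performs in C++, including the fall-through to fp32
# for unrecognized precision strings.
_PRECISION_MAP = {"fp32": "kFloat32", "int8": "kInt8", "fp16": "kHalf"}

def get_precision(precision_data: str) -> str:
    # Lower-case first, mirroring predictor::ToLower() in the C++ code.
    return _PRECISION_MAP.get(precision_data.lower(), "kFloat32")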
@@ -137,6 +152,7 @@ class PaddleInferenceEngine : public EngineCore {
       // 2000MB GPU memory
       config.EnableUseGpu(2000, FLAGS_gpuid);
     }
+    precision_type = GetPrecision(FLAGS_precision);

     if (engine_conf.has_use_trt() && engine_conf.use_trt()) {
       if (!engine_conf.has_use_gpu() || !engine_conf.use_gpu()) {
@@ -145,14 +161,24 @@ class PaddleInferenceEngine : public EngineCore {
       config.EnableTensorRtEngine(1 << 20,
                                   max_batch,
                                   min_subgraph_size,
-                                  Config::Precision::kFloat32,
+                                  precision_type,
                                   false,
-                                  false);
+                                  FLAGS_use_calib);
       LOG(INFO) << "create TensorRT predictor";
     }

     if (engine_conf.has_use_lite() && engine_conf.use_lite()) {
-      config.EnableLiteEngine(PrecisionType::kFloat32, true);
+      config.EnableLiteEngine(precision_type, true);
     }

+    if ((!engine_conf.has_use_lite() && !engine_conf.has_use_gpu()) ||
+        (engine_conf.has_use_lite() && !engine_conf.use_lite() &&
+         engine_conf.has_use_gpu() && !engine_conf.use_gpu())) {
+      if (precision_type == PrecisionType::kInt8) {
+        config.EnableMkldnnQuantizer();
+      } else if (precision_type == PrecisionType::kHalf) {
+        config.EnableMkldnnBfloat16();
+      }
+    }
+
     if (engine_conf.has_use_xpu() && engine_conf.use_xpu()) {
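Note the CPU-only branch added here: when neither Lite nor GPU is enabled, an int8 request selects MKL-DNN quantization while an fp16 request is mapped to MKL-DNN bfloat16 rather than true half precision. A Python sketch of that decision (illustration only, not the server's actual code path):

# Sketch of the CPU-only branch above: "fp16" becomes a bfloat16 pass on
# CPU, "int8" becomes an MKL-DNN quantizer pass, and fp32 adds nothing.
def cpu_precision_passes(precision: str) -> list:
    if precision == "int8":
        return ["EnableMkldnnQuantizer"]
    if precision == "fp16":
        return ["EnableMkldnnBfloat16"]
    return []  # fp32: no extra low-precision pass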
@@ -171,7 +197,6 @@ class PaddleInferenceEngine : public EngineCore {
       config.EnableMemoryOptim();
     }

-    predictor::AutoLock lock(predictor::GlobalCreateMutex::instance());
     _predictor = CreatePredictor(config);
     if (NULL == _predictor.get()) {
......
@@ -20,6 +20,8 @@ namespace paddle_serving {
 namespace inference {

 DEFINE_int32(gpuid, 0, "GPU device id to use");
+DEFINE_string(precision, "fp32", "precision to deploy, default is fp32");
+DEFINE_bool(use_calib, false, "calibration mode, default is false");

 REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
     ::baidu::paddle_serving::predictor::FluidInferEngine<PaddleInferenceEngine>,
......
@@ -51,6 +51,16 @@ def serve_args():
         "--name", type=str, default="None", help="Default service name")
     parser.add_argument(
         "--use_mkl", default=False, action="store_true", help="Use MKL")
+    parser.add_argument(
+        "--precision",
+        type=str,
+        default="fp32",
+        help="precision mode(fp32, int8, fp16, bf16)")
+    parser.add_argument(
+        "--use_calib",
+        default=False,
+        action="store_true",
+        help="Use TensorRT Calibration")
     parser.add_argument(
         "--mem_optim_off",
         default=False,
@@ -147,6 +157,8 @@ def start_standard_model(serving_port):  # pylint: disable=doc-string-missing
     server.use_mkl(use_mkl)
     server.set_max_body_size(max_body_size)
     server.set_port(port)
+    server.set_precision(args.precision)
+    server.set_use_calib(args.use_calib)
     server.use_encryption_model(use_encryption_model)
     if args.product_name != None:
         server.set_product_name(args.product_name)
@@ -209,6 +221,8 @@ def start_gpu_card_model(index, gpuid, port, args):  # pylint: disable=doc-string-missing
     server.set_op_sequence(op_seq_maker.get_op_sequence())
     server.set_num_threads(thread_num)
     server.use_mkl(use_mkl)
+    server.set_precision(args.precision)
+    server.set_use_calib(args.use_calib)
     server.set_memory_optimize(mem_optim)
     server.set_ir_optimize(ir_optim)
     server.set_max_body_size(max_body_size)
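The new flags defined in serve_args() reach these setters through args. A hypothetical launch that exercises them (the model directory and port are assumptions for illustration, not taken from this diff):

# Hypothetical launch command, wrapped in Python for illustration only.
import subprocess

subprocess.run([
    "python", "-m", "paddle_serving_server_gpu.serve",
    "--model", "serving_server_model",  # assumed model directory
    "--port", "9292",
    "--precision", "int8",  # one of fp32 / int8 / fp16 / bf16
    "--use_calib",          # enables TensorRT INT8 calibration
])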
@@ -396,7 +410,9 @@ if __name__ == "__main__":
             use_lite=args.use_lite,
             use_xpu=args.use_xpu,
             ir_optim=args.ir_optim,
-            thread_num=args.thread)
+            thread_num=args.thread,
+            precision=args.precision,
+            use_calib=args.use_calib)
         web_service.run_rpc_service()

         app_instance = Flask(__name__)
......
@@ -115,7 +115,9 @@ class WebService(object):
                           mem_optim=True,
                           use_lite=False,
                           use_xpu=False,
-                          ir_optim=False):
+                          ir_optim=False,
+                          precision="fp32",
+                          use_calib=False):
         device = "gpu"
         if gpuid == -1:
             if use_lite:
@@ -146,6 +148,8 @@ class WebService(object):
         server.set_memory_optimize(mem_optim)
         server.set_ir_optimize(ir_optim)
         server.set_device(device)
+        server.set_precision(precision)
+        server.set_use_calib(use_calib)
         if use_lite:
             server.set_lite()
@@ -166,6 +170,8 @@ class WebService(object):
                        workdir="",
                        port=9393,
                        device="gpu",
+                       precision="fp32",
+                       use_calib=False,
                        use_lite=False,
                        use_xpu=False,
                        ir_optim=False,
@@ -197,7 +203,9 @@ class WebService(object):
                     mem_optim=mem_optim,
                     use_lite=use_lite,
                     use_xpu=use_xpu,
-                    ir_optim=ir_optim))
+                    ir_optim=ir_optim,
+                    precision=precision,
+                    use_calib=use_calib))
         else:
             for i, gpuid in enumerate(self.gpus):
                 self.rpc_service_list.append(
@@ -209,7 +217,9 @@ class WebService(object):
                     mem_optim=mem_optim,
                     use_lite=use_lite,
                     use_xpu=use_xpu,
-                    ir_optim=ir_optim))
+                    ir_optim=ir_optim,
+                    precision=precision,
+                    use_calib=use_calib))

     def _launch_web_service(self):
         gpu_num = len(self.gpus)
This diff is collapsed.