Commit b306084f, authored by HexToString

Merge branch 'develop' of https://github.com/PaddlePaddle/Serving into merge_branch

@@ -37,9 +37,24 @@ using paddle_infer::Tensor;
 using paddle_infer::CreatePredictor;
 DECLARE_int32(gpuid);
+DECLARE_string(precision);
+DECLARE_bool(use_calib);
 static const int max_batch = 32;
 static const int min_subgraph_size = 3;
+static PrecisionType precision_type;
+
+PrecisionType GetPrecision(const std::string& precision_data) {
+  std::string precision_type = predictor::ToLower(precision_data);
+  if (precision_type == "fp32") {
+    return PrecisionType::kFloat32;
+  } else if (precision_type == "int8") {
+    return PrecisionType::kInt8;
+  } else if (precision_type == "fp16") {
+    return PrecisionType::kHalf;
+  }
+  return PrecisionType::kFloat32;
+}
 // Engine Base
 class EngineCore {
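Note that GetPrecision() lower-cases its input via predictor::ToLower and falls back to kFloat32 for any unrecognized string, so a typo in --precision silently yields an fp32 engine; "bf16" from the Python help text is also not matched here (on CPU, "fp16" is what selects bfloat16, as the later hunk shows). A minimal Python sketch of the same normalization, illustrative only and not part of this patch:

def normalize_precision(precision):
    # Mirrors GetPrecision(): case-insensitive match with an fp32 fallback.
    mapping = {"fp32": "kFloat32", "int8": "kInt8", "fp16": "kHalf"}
    return mapping.get(precision.lower(), "kFloat32")

assert normalize_precision("FP16") == "kHalf"
assert normalize_precision("bf16") == "kFloat32"  # unknown strings fall back to fp32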
@@ -137,6 +152,7 @@ class PaddleInferenceEngine : public EngineCore {
       // 2000MB GPU memory
       config.EnableUseGpu(2000, FLAGS_gpuid);
     }
+    precision_type = GetPrecision(FLAGS_precision);
     if (engine_conf.has_use_trt() && engine_conf.use_trt()) {
       if (!engine_conf.has_use_gpu() || !engine_conf.use_gpu()) {
@@ -145,14 +161,24 @@ class PaddleInferenceEngine : public EngineCore {
       config.EnableTensorRtEngine(1 << 20,
                                   max_batch,
                                   min_subgraph_size,
-                                  Config::Precision::kFloat32,
+                                  precision_type,
                                   false,
-                                  false);
+                                  FLAGS_use_calib);
       LOG(INFO) << "create TensorRT predictor";
     }
     if (engine_conf.has_use_lite() && engine_conf.use_lite()) {
-      config.EnableLiteEngine(PrecisionType::kFloat32, true);
+      config.EnableLiteEngine(precision_type, true);
+    }
+
+    if ((!engine_conf.has_use_lite() && !engine_conf.has_use_gpu()) ||
+        (engine_conf.has_use_lite() && !engine_conf.use_lite() &&
+         engine_conf.has_use_gpu() && !engine_conf.use_gpu())) {
+      if (precision_type == PrecisionType::kInt8) {
+        config.EnableMkldnnQuantizer();
+      } else if (precision_type == PrecisionType::kHalf) {
+        config.EnableMkldnnBfloat16();
+      }
     }
     if (engine_conf.has_use_xpu() && engine_conf.use_xpu()) {
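The new CPU-only branch (neither Lite nor GPU in use) reuses precision_type to pick an MKL-DNN mode: kInt8 enables the post-training quantizer and kHalf enables bfloat16 inference, so on CPU the string "fp16" effectively requests bf16. An illustrative summary of which Paddle Inference switch each device/precision combination now triggers (a sketch, not code from the patch):

# Sketch: device/precision dispatch after this change. Values name the calls
# made on the paddle_infer Config; "cpu" means neither GPU nor Lite is enabled.
DISPATCH = {
    ("tensorrt", "fp32"): "EnableTensorRtEngine(..., kFloat32, ..., FLAGS_use_calib)",
    ("tensorrt", "fp16"): "EnableTensorRtEngine(..., kHalf, ..., FLAGS_use_calib)",
    ("tensorrt", "int8"): "EnableTensorRtEngine(..., kInt8, ..., FLAGS_use_calib)",
    ("lite", "<any>"): "EnableLiteEngine(precision_type, true)",
    ("cpu", "int8"): "EnableMkldnnQuantizer()",
    ("cpu", "fp16"): "EnableMkldnnBfloat16()",
}

for (device, precision), call in DISPATCH.items():
    print(f"{device:8s} {precision:6s} -> {call}")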
@@ -171,7 +197,6 @@ class PaddleInferenceEngine : public EngineCore {
       config.EnableMemoryOptim();
     }
     predictor::AutoLock lock(predictor::GlobalCreateMutex::instance());
-
     _predictor = CreatePredictor(config);
     if (NULL == _predictor.get()) {
......
@@ -20,6 +20,8 @@ namespace paddle_serving {
 namespace inference {
 DEFINE_int32(gpuid, 0, "GPU device id to use");
+DEFINE_string(precision, "fp32", "precision to deploy, default is fp32");
+DEFINE_bool(use_calib, false, "calibration mode, default is false");
 REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
     ::baidu::paddle_serving::predictor::FluidInferEngine<PaddleInferenceEngine>,
......
@@ -51,6 +51,16 @@ def serve_args():
         "--name", type=str, default="None", help="Default service name")
     parser.add_argument(
         "--use_mkl", default=False, action="store_true", help="Use MKL")
+    parser.add_argument(
+        "--precision",
+        type=str,
+        default="fp32",
+        help="precision mode(fp32, int8, fp16, bf16)")
+    parser.add_argument(
+        "--use_calib",
+        default=False,
+        action="store_true",
+        help="Use TensorRT Calibration")
     parser.add_argument(
         "--mem_optim_off",
         default=False,
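The parser changes can be exercised on their own; a self-contained sketch of the two new flags exactly as defined above:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--precision",
    type=str,
    default="fp32",
    help="precision mode(fp32, int8, fp16, bf16)")
parser.add_argument(
    "--use_calib",
    default=False,
    action="store_true",
    help="Use TensorRT Calibration")

# --use_calib is a store_true switch; --precision takes one of the listed strings.
args = parser.parse_args(["--precision", "int8", "--use_calib"])
assert args.precision == "int8" and args.use_calib is True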
@@ -147,6 +157,8 @@ def start_standard_model(serving_port):  # pylint: disable=doc-string-missing
     server.use_mkl(use_mkl)
     server.set_max_body_size(max_body_size)
     server.set_port(port)
+    server.set_precision(args.precision)
+    server.set_use_calib(args.use_calib)
     server.use_encryption_model(use_encryption_model)
     if args.product_name != None:
         server.set_product_name(args.product_name)
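For context, a hedged end-to-end sketch of a server that honors the new setters; the op assembly follows the usual Serving pattern, and the import path and model directory are assumptions rather than code from the patch:

from paddle_serving_server import OpMaker, OpSeqMaker, Server  # import path assumed

op_maker = OpMaker()
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(op_maker.create('general_reader'))
op_seq_maker.add_op(op_maker.create('general_infer'))
op_seq_maker.add_op(op_maker.create('general_response'))

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_precision("int8")   # reaches the C++ engine as FLAGS_precision
server.set_use_calib(False)    # no TensorRT calibration on this CPU path
server.load_model_config("serving_server")  # placeholder model directory
server.prepare_server(workdir="workdir", port=9393, device="cpu")
server.run_server()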
@@ -209,6 +221,8 @@ def start_gpu_card_model(index, gpuid, port, args):  # pylint: disable=doc-string-missing
     server.set_op_sequence(op_seq_maker.get_op_sequence())
     server.set_num_threads(thread_num)
     server.use_mkl(use_mkl)
+    server.set_precision(args.precision)
+    server.set_use_calib(args.use_calib)
     server.set_memory_optimize(mem_optim)
     server.set_ir_optimize(ir_optim)
     server.set_max_body_size(max_body_size)
@@ -396,7 +410,9 @@ if __name__ == "__main__":
         use_lite=args.use_lite,
         use_xpu=args.use_xpu,
         ir_optim=args.ir_optim,
-        thread_num=args.thread)
+        thread_num=args.thread,
+        precision=args.precision,
+        use_calib=args.use_calib)
     web_service.run_rpc_service()
     app_instance = Flask(__name__)
......
@@ -115,7 +115,9 @@ class WebService(object):
                             mem_optim=True,
                             use_lite=False,
                             use_xpu=False,
-                            ir_optim=False):
+                            ir_optim=False,
+                            precision="fp32",
+                            use_calib=False):
         device = "gpu"
         if gpuid == -1:
             if use_lite:
@@ -146,6 +148,8 @@ class WebService(object):
         server.set_memory_optimize(mem_optim)
         server.set_ir_optimize(ir_optim)
         server.set_device(device)
+        server.set_precision(precision)
+        server.set_use_calib(use_calib)
         if use_lite:
             server.set_lite()
@@ -166,6 +170,8 @@ class WebService(object):
                        workdir="",
                        port=9393,
                        device="gpu",
+                       precision="fp32",
+                       use_calib=False,
                        use_lite=False,
                        use_xpu=False,
                        ir_optim=False,
@@ -197,7 +203,9 @@ class WebService(object):
                     mem_optim=mem_optim,
                     use_lite=use_lite,
                     use_xpu=use_xpu,
-                    ir_optim=ir_optim))
+                    ir_optim=ir_optim,
+                    precision=precision,
+                    use_calib=use_calib))
         else:
             for i, gpuid in enumerate(self.gpus):
                 self.rpc_service_list.append(
@@ -209,7 +217,9 @@ class WebService(object):
                         mem_optim=mem_optim,
                         use_lite=use_lite,
                         use_xpu=use_xpu,
-                        ir_optim=ir_optim))
+                        ir_optim=ir_optim,
+                        precision=precision,
+                        use_calib=use_calib))

     def _launch_web_service(self):
         gpu_num = len(self.gpus)
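On the WebService side, the new keyword arguments flow from prepare_server() into every rpc service built in the loops above. A hedged usage sketch; the import path, set_gpus call, and model directory are assumptions based on Serving usage of this era, while the precision and use_calib keywords come from the patch itself:

from paddle_serving_server_gpu.web_service import WebService  # import path assumed

web_service = WebService(name="demo")
web_service.load_model_config("serving_server")  # placeholder model directory
web_service.set_gpus("0")
web_service.prepare_server(
    workdir="workdir",
    port=9393,
    device="gpu",
    precision="int8",
    use_calib=True)  # TensorRT INT8 with calibration
web_service.run_rpc_service()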
......
The remaining file diff is collapsed.