diff --git a/README.md b/README.md
index 747c140ded49f279c289b0bc8a3b4b1963243040..84fbf579579194076d9994079628bf056506f4b0 100644
--- a/README.md
+++ b/README.md
@@ -82,7 +82,8 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 | `port` | int | `9292` | Exposed port of current service to users|
 | `name` | str | `""` | Service name, can be used to generate HTTP request url |
 | `model` | str | `""` | Path of paddle model directory to be served |
-| `mem_optim` | bool | `False` | Enable memory optimization |
+| `mem_optim` | bool | `False` | Enable memory / graphics memory optimization |
+| `ir_optim` | bool | `False` | Enable analysis and optimization of the computation graph |

 Here, we use `curl` to send a HTTP POST request to the service we just started. Users can use any python library to send HTTP POST as well, e.g, [requests](https://requests.readthedocs.io/en/master/).

diff --git a/README_CN.md b/README_CN.md
index 266fca330d7597d6188fa0022e6376bc23149c74..6e0843e40588bb6f5af91b17c2eb85bf4bebc8e8 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -87,6 +87,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 | `name` | str | `""` | Service name, can be used to generate HTTP request url |
 | `model` | str | `""` | Path of paddle model directory to be served |
 | `mem_optim` | bool | `False` | Enable memory optimization |
+| `ir_optim` | bool | `False` | Enable analysis and optimization of the computation graph |

 我们使用 `curl` 命令来发送HTTP POST请求给刚刚启动的服务。用户也可以调用python库来发送HTTP POST请求,请参考英文文档 [requests](https://requests.readthedocs.io/en/master/)。

diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto
index 4bdc233099cffbc7949a6b5cf8627fe6461f565c..8956022685090c94be2037445c646e9fbffd1a5c 100644
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -43,6 +43,7 @@ message EngineDesc {
   optional bool enable_memory_optimization = 13;
   optional bool static_optimization = 14;
   optional bool force_update_static_cache = 15;
+  optional bool enable_ir_optimization = 16;
 };

 // model_toolkit conf
diff --git a/core/predictor/framework/infer.h b/core/predictor/framework/infer.h
index 4bb3be9ad2c3dc7ef94a32200b014325aceedf45..e8c0ff47d86f081516a35576655f843a28b0591b 100644
--- a/core/predictor/framework/infer.h
+++ b/core/predictor/framework/infer.h
@@ -35,6 +35,7 @@ class InferEngineCreationParams {
   InferEngineCreationParams() {
     _path = "";
     _enable_memory_optimization = false;
+    _enable_ir_optimization = false;
     _static_optimization = false;
     _force_update_static_cache = false;
   }
@@ -45,10 +46,16 @@ class InferEngineCreationParams {
     _enable_memory_optimization = enable_memory_optimization;
   }

+  void set_enable_ir_optimization(bool enable_ir_optimization) {
+    _enable_ir_optimization = enable_ir_optimization;
+  }
+
   bool enable_memory_optimization() const {
     return _enable_memory_optimization;
   }

+  bool enable_ir_optimization() const { return _enable_ir_optimization; }
+
   void set_static_optimization(bool static_optimization = false) {
     _static_optimization = static_optimization;
   }
@@ -68,6 +75,7 @@ class InferEngineCreationParams {
               << "model_path = " << _path << ", "
               << "enable_memory_optimization = " << _enable_memory_optimization
               << ", "
+              << "enable_ir_optimization = " << _enable_ir_optimization << ", "
               << "static_optimization = " << _static_optimization << ", "
               << "force_update_static_cache = " << _force_update_static_cache;
   }
@@ -75,6 +83,7 @@ class InferEngineCreationParams {
  private:
   std::string _path;
   bool _enable_memory_optimization;
+  bool _enable_ir_optimization;
   bool _static_optimization;
   bool _force_update_static_cache;
 };
@@ -150,6 +159,11 @@ class ReloadableInferEngine : public InferEngine {
       force_update_static_cache = conf.force_update_static_cache();
     }

+    if (conf.has_enable_ir_optimization()) {
+      _infer_engine_params.set_enable_ir_optimization(
+          conf.enable_ir_optimization());
+    }
+
     _infer_engine_params.set_path(_model_data_path);
     if (enable_memory_optimization) {
       _infer_engine_params.set_enable_memory_optimization(true);
diff --git a/doc/PERFORMANCE_OPTIM.md b/doc/PERFORMANCE_OPTIM.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b025e94d6f8d3ed69fb76898eb6afada9ca6613
--- /dev/null
+++ b/doc/PERFORMANCE_OPTIM.md
@@ -0,0 +1,25 @@
+# Performance optimization
+
+Because model structures differ, different prediction services consume different amounts of computing resources when performing predictions. For an online prediction service, a model that requires relatively little computation spends a larger share of its time on communication and is called a communication-intensive service, while a model that requires more computation spends most of its time on inference and is called a computation-intensive service.
+
+For a given prediction service, the easiest way to tell which type it belongs to is to look at the time breakdown. Paddle Serving provides the [Timeline tool](../python/examples/util/README_CN.md), which visually shows the time spent in each stage of the prediction service.
+
+For communication-intensive prediction services, requests can be aggregated: as long as the added latency is tolerable, multiple prediction requests can be combined into one batch for prediction.
+
+For computation-intensive prediction services, you can use a GPU prediction service instead of a CPU prediction service, or increase the number of GPU cards used by the GPU prediction service.
+
+Under the same conditions, the communication time of the HTTP prediction service provided by Paddle Serving is longer than that of the RPC prediction service, so for communication-intensive services, please give priority to RPC communication.
+
+Parameters for performance optimization:
+
+| Parameters | Type | Default | Description |
+| ---------- | ---- | ------- | ------------------------------------------------------------ |
+| mem_optim | bool | False | Enable memory / graphics memory optimization |
+| ir_optim | bool | False | Enable analysis and optimization of the computation graph, including OP fusion, etc. |
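+
+For example, both optimizations can be enabled when starting a serving process from the command line (a sketch based on the quick-start demo; adjust the model path and port to your own deployment):
+
+```shell
+# Enable memory / graphics memory optimization and computation graph optimization
+python -m paddle_serving_server.serve --model uci_housing_model --port 9292 --mem_optim True --ir_optim True
+```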
diff --git a/doc/PERFORMANCE_OPTIM_CN.md b/doc/PERFORMANCE_OPTIM_CN.md
index dd17bc8afab8472f8f55b4870f73e4c481e97cd3..7bd64d3e2d645c9328ead55e867d0b97946840ad 100644
--- a/doc/PERFORMANCE_OPTIM_CN.md
+++ b/doc/PERFORMANCE_OPTIM_CN.md
@@ -1,6 +1,6 @@
 # 性能优化

-由于模型结构的不同,在执行预测时不同的预测对计算资源的消耗也不相同,对于在线的预测服务来说,对计算资源要求较少的模型,通信的时间成本占比就会较高,称为通信密集型服务,对计算资源要求较多的模型,推理计算的时间成本较高,称为计算密集型服务。对于这两种服务类型,可以根据实际需求采取不同的方式进行优化
+由于模型结构的不同,在执行预测时不同的预测服务对计算资源的消耗也不相同。对于在线的预测服务来说,对计算资源要求较少的模型,通信的时间成本占比就会较高,称为通信密集型服务,对计算资源要求较多的模型,推理计算的时间成本较高,称为计算密集型服务。对于这两种服务类型,可以根据实际需求采取不同的方式进行优化

 对于一个预测服务来说,想要判断属于哪种类型,最简单的方法就是看时间占比,Paddle Serving提供了[Timeline工具](../python/examples/util/README_CN.md),可以直观的展现预测服务中各阶段的耗时。

@@ -10,4 +10,9 @@
 在相同条件下,Paddle Serving提供的HTTP预测服务的通信时间是大于RPC预测服务的,因此对于通信密集型的服务请优先考虑使用RPC的通信方式。

-对于模型较大,预测服务内存或显存占用较多的情况,可以通过将--mem_optim选项设置为True来开启内存/显存优化。
+性能优化相关参数:
+
+| 参数 | 类型 | 默认值 | 含义 |
+| --------- | ---- | ------ | -------------------------------- |
+| mem_optim | bool | False | 开启内存/显存优化 |
+| ir_optim | bool | False | 开启计算图分析优化,包括OP融合等 |

diff --git a/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h b/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h
index 24148e374e51cb42cb0d8d1423e0ca009e9e8294..a4d8dda71a7977185106bb1552cb8f39ef6bc50e 100644
--- a/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h
+++ b/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h
@@ -194,6 +194,12 @@ class FluidCpuAnalysisDirCore : public FluidFamilyCore {
       analysis_config.EnableMemoryOptim();
     }

+    if (params.enable_ir_optimization()) {
+      analysis_config.SwitchIrOptim(true);
+    } else {
+      analysis_config.SwitchIrOptim(false);
+    }
+
     AutoLock lock(GlobalPaddleCreateMutex::instance());
     _core =
         paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
diff --git a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
index a3fa365444a40d505b16b22e702d4a8b69699073..2fc6ae587ff26f5f05ff9332f08067ab49d06254 100644
--- a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
+++ b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
@@ -198,6 +198,12 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
       analysis_config.EnableMemoryOptim();
     }

+    if (params.enable_ir_optimization()) {
+      analysis_config.SwitchIrOptim(true);
+    } else {
+      analysis_config.SwitchIrOptim(false);
+    }
+
     AutoLock lock(GlobalPaddleCreateMutex::instance());
     _core =
         paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
diff --git a/python/paddle_serving_server/__init__.py b/python/paddle_serving_server/__init__.py
index a58fb11ac3ee1fbe5086ae4381f6d6208c0c73ec..f4f3248c8174487b51554f27182c0c0cd9740d25 100644
--- a/python/paddle_serving_server/__init__.py
+++ b/python/paddle_serving_server/__init__.py
@@ -127,6 +127,7 @@ class Server(object):
         self.model_toolkit_conf = None
         self.resource_conf = None
         self.memory_optimization = False
+        self.ir_optimization = False
         self.model_conf = None
         self.workflow_fn = "workflow.prototxt"
         self.resource_fn = "resource.prototxt"
@@ -175,6 +176,9 @@ def set_memory_optimize(self, flag=False):
         self.memory_optimization = flag

+    def set_ir_optimize(self, flag=False):
+        self.ir_optimization = flag
+
     def check_local_bin(self):
         if "SERVING_BIN" in os.environ:
             self.use_local_bin = True
@@ -195,6 +199,7 @@ class Server(object):
             engine.enable_batch_align = 0
             engine.model_data_path = model_config_path
             engine.enable_memory_optimization = self.memory_optimization
+            engine.enable_ir_optimization = self.ir_optimization
             engine.static_optimization = False
             engine.force_update_static_cache = False

@@ -244,7 +249,7 @@ class Server(object):
         workflow_oi_config_path = None
         if isinstance(model_config_paths, str):
             # If there is only one model path, use the default infer_op.
-            # Because there are several infer_op type, we need to find 
+            # Because there are several infer_op type, we need to find
             # it from workflow_conf.
             default_engine_names = [
                 'general_infer_0', 'general_dist_kv_infer_0',
diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py
index 395177a8c77e5c608c2e0364b1d43ac534172d66..58a1301384a7eb9b991139b225294cbe0cb198f1 100644
--- a/python/paddle_serving_server/serve.py
+++ b/python/paddle_serving_server/serve.py
@@ -41,6 +41,8 @@ def parse_args():  # pylint: disable=doc-string-missing
         "--device", type=str, default="cpu", help="Type of device")
     parser.add_argument(
         "--mem_optim", type=bool, default=False, help="Memory optimize")
+    parser.add_argument(
+        "--ir_optim", type=bool, default=False, help="Graph optimize")
     parser.add_argument(
         "--max_body_size",
         type=int,
@@ -57,6 +59,7 @@ def start_standard_model():  # pylint: disable=doc-string-missing
     workdir = args.workdir
     device = args.device
     mem_optim = args.mem_optim
+    ir_optim = args.ir_optim
     max_body_size = args.max_body_size

     if model == "":
@@ -78,6 +81,7 @@ def start_standard_model():  # pylint: disable=doc-string-missing
     server.set_op_sequence(op_seq_maker.get_op_sequence())
     server.set_num_threads(thread_num)
     server.set_memory_optimize(mem_optim)
+    server.set_ir_optimize(ir_optim)
     server.set_max_body_size(max_body_size)
     server.set_port(port)

diff --git a/python/paddle_serving_server_gpu/__init__.py b/python/paddle_serving_server_gpu/__init__.py
index 5fa4f010f2112bd400b81ba2f616e4ebe963a810..5a06bd712a836617047b0cc947956fc5d2213daa 100644
--- a/python/paddle_serving_server_gpu/__init__.py
+++ b/python/paddle_serving_server_gpu/__init__.py
@@ -47,6 +47,8 @@ def serve_args():
         "--name", type=str, default="None", help="Default service name")
     parser.add_argument(
         "--mem_optim", type=bool, default=False, help="Memory optimize")
+    parser.add_argument(
+        "--ir_optim", type=bool, default=False, help="Graph optimize")
     parser.add_argument(
         "--max_body_size",
         type=int,
@@ -156,6 +158,7 @@ class Server(object):
         self.model_toolkit_conf = None
         self.resource_conf = None
         self.memory_optimization = False
+        self.ir_optimization = False
         self.model_conf = None
         self.workflow_fn = "workflow.prototxt"
         self.resource_fn = "resource.prototxt"
@@ -204,6 +207,9 @@ class Server(object):
     def set_memory_optimize(self, flag=False):
         self.memory_optimization = flag

+    def set_ir_optimize(self, flag=False):
+        self.ir_optimization = flag
+
     def check_local_bin(self):
         if "SERVING_BIN" in os.environ:
             self.use_local_bin = True
@@ -240,6 +246,7 @@ class Server(object):
             engine.enable_batch_align = 0
             engine.model_data_path = model_config_path
             engine.enable_memory_optimization = self.memory_optimization
+            engine.enable_ir_optimization = self.ir_optimization
             engine.static_optimization = False
             engine.force_update_static_cache = False

diff --git a/python/paddle_serving_server_gpu/serve.py b/python/paddle_serving_server_gpu/serve.py
index 512b5ec0a7d15a030afdcaa5e8daa344b29fb96e..297ff25d2084bead186fa4b9037e5de8282df0fe 100644
--- a/python/paddle_serving_server_gpu/serve.py
+++ b/python/paddle_serving_server_gpu/serve.py
@@ -35,6 +35,7 @@ def start_gpu_card_model(index, gpuid, args):  # pylint: disable=doc-string-miss
     thread_num = args.thread
     model = args.model
     mem_optim = args.mem_optim
+    ir_optim = args.ir_optim
     max_body_size = args.max_body_size
     workdir = "{}_{}".format(args.workdir, gpuid)

@@ -57,6 +58,7 @@ def start_gpu_card_model(index, gpuid, args):  # pylint: disable=doc-string-miss
     server.set_op_sequence(op_seq_maker.get_op_sequence())
     server.set_num_threads(thread_num)
     server.set_memory_optimize(mem_optim)
+    server.set_ir_optimize(ir_optim)
     server.set_max_body_size(max_body_size)

     server.load_model_config(model)