diff --git a/README.md b/README.md index 747c140ded49f279c289b0bc8a3b4b1963243040..9d1ec854ba67d220a481816cda5eeebf2bc89739 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,9 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po | `port` | int | `9292` | Exposed port of current service to users| | `name` | str | `""` | Service name, can be used to generate HTTP request url | | `model` | str | `""` | Path of paddle model directory to be served | -| `mem_optim` | bool | `False` | Enable memory optimization | +| `mem_optim` | bool | `False` | Enable memory / graphic memory optimization | +| `ir_optim` | bool | `False` | Enable analysis and optimization of calculation graph | +| `use_mkl` (Only for cpu version) | bool | `False` | Run inference with MKL | Here, we use `curl` to send a HTTP POST request to the service we just started. Users can use any python library to send HTTP POST as well, e.g, [requests](https://requests.readthedocs.io/en/master/). diff --git a/README_CN.md b/README_CN.md index 266fca330d7597d6188fa0022e6376bc23149c74..0c30ef0cffea7d2940c544c55b641255108908fd 100644 --- a/README_CN.md +++ b/README_CN.md @@ -87,6 +87,8 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po | `name` | str | `""` | Service name, can be used to generate HTTP request url | | `model` | str | `""` | Path of paddle model directory to be served | | `mem_optim` | bool | `False` | Enable memory optimization | +| `ir_optim` | bool | `False` | Enable analysis and optimization of calculation graph | +| `use_mkl` (Only for cpu version) | bool | `False` | Run inference with MKL | 我们使用 `curl` 命令来发送HTTP POST请求给刚刚启动的服务。用户也可以调用python库来发送HTTP POST请求,请参考英文文档 [requests](https://requests.readthedocs.io/en/master/)。 diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto index 4bdc233099cffbc7949a6b5cf8627fe6461f565c..8956022685090c94be2037445c646e9fbffd1a5c 100644 --- a/core/configure/proto/server_configure.proto +++ b/core/configure/proto/server_configure.proto @@ -43,6 +43,7 @@ message EngineDesc { optional bool enable_memory_optimization = 13; optional bool static_optimization = 14; optional bool force_update_static_cache = 15; + optional bool enable_ir_optimization = 16; }; // model_toolkit conf diff --git a/core/general-client/src/general_model.cpp b/core/general-client/src/general_model.cpp index 86f75bc1c1b401cd14f2c6651ea52ef08fdb8c40..cab050e732fb701120c7f1a5c80737fc75282324 100644 --- a/core/general-client/src/general_model.cpp +++ b/core/general-client/src/general_model.cpp @@ -345,7 +345,7 @@ int PredictorClient::numpy_predict( PredictorRes &predict_res_batch, const int &pid) { int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size()); - + VLOG(2) << "batch size: " << batch_size; predict_res_batch.clear(); Timer timeline; int64_t preprocess_start = timeline.TimeStampUS(); @@ -462,7 +462,7 @@ int PredictorClient::numpy_predict( for (ssize_t j = 0; j < int_array.shape(1); j++) { for (ssize_t k = 0; k < int_array.shape(2); k++) { for (ssize_t l = 0; k < int_array.shape(3); l++) { - tensor->add_float_data(int_array(i, j, k, l)); + tensor->add_int64_data(int_array(i, j, k, l)); } } } @@ -474,7 +474,7 @@ int PredictorClient::numpy_predict( for (ssize_t i = 0; i < int_array.shape(0); i++) { for (ssize_t j = 0; j < int_array.shape(1); j++) { for (ssize_t k = 0; k < int_array.shape(2); k++) { - tensor->add_float_data(int_array(i, j, k)); + tensor->add_int64_data(int_array(i, j, k)); } } } 
@@ -484,7 +484,7 @@ int PredictorClient::numpy_predict( auto int_array = int_feed[vec_idx].unchecked<2>(); for (ssize_t i = 0; i < int_array.shape(0); i++) { for (ssize_t j = 0; j < int_array.shape(1); j++) { - tensor->add_float_data(int_array(i, j)); + tensor->add_int64_data(int_array(i, j)); } } break; @@ -492,7 +492,7 @@ int PredictorClient::numpy_predict( case 1: { auto int_array = int_feed[vec_idx].unchecked<1>(); for (ssize_t i = 0; i < int_array.shape(0); i++) { - tensor->add_float_data(int_array(i)); + tensor->add_int64_data(int_array(i)); } break; } diff --git a/core/predictor/framework/infer.h b/core/predictor/framework/infer.h index 4bb3be9ad2c3dc7ef94a32200b014325aceedf45..e8c0ff47d86f081516a35576655f843a28b0591b 100644 --- a/core/predictor/framework/infer.h +++ b/core/predictor/framework/infer.h @@ -35,6 +35,7 @@ class InferEngineCreationParams { InferEngineCreationParams() { _path = ""; _enable_memory_optimization = false; + _enable_ir_optimization = false; _static_optimization = false; _force_update_static_cache = false; } @@ -45,10 +46,16 @@ class InferEngineCreationParams { _enable_memory_optimization = enable_memory_optimization; } + void set_enable_ir_optimization(bool enable_ir_optimization) { + _enable_ir_optimization = enable_ir_optimization; + } + bool enable_memory_optimization() const { return _enable_memory_optimization; } + bool enable_ir_optimization() const { return _enable_ir_optimization; } + void set_static_optimization(bool static_optimization = false) { _static_optimization = static_optimization; } @@ -68,6 +75,7 @@ class InferEngineCreationParams { << "model_path = " << _path << ", " << "enable_memory_optimization = " << _enable_memory_optimization << ", " + << "enable_ir_optimization = " << _enable_ir_optimization << ", " << "static_optimization = " << _static_optimization << ", " << "force_update_static_cache = " << _force_update_static_cache; } @@ -75,6 +83,7 @@ class InferEngineCreationParams { private: std::string _path; bool _enable_memory_optimization; + bool _enable_ir_optimization; bool _static_optimization; bool _force_update_static_cache; }; @@ -150,6 +159,11 @@ class ReloadableInferEngine : public InferEngine { force_update_static_cache = conf.force_update_static_cache(); } + if (conf.has_enable_ir_optimization()) { + _infer_engine_params.set_enable_ir_optimization( + conf.enable_ir_optimization()); + } + _infer_engine_params.set_path(_model_data_path); if (enable_memory_optimization) { _infer_engine_params.set_enable_memory_optimization(true); diff --git a/doc/COMPILE.md b/doc/COMPILE.md index 41a79f082494b0ac22bb4479a5d246cdb6882a3d..f61ac061883581090087a2202e694c9a07468c5f 100644 --- a/doc/COMPILE.md +++ b/doc/COMPILE.md @@ -9,14 +9,18 @@ - Golang: 1.9.2 and later - Git:2.17.1 and later - CMake:3.2.2 and later -- Python:2.7.2 and later +- Python:2.7.2 and later / 3.6 and later It is recommended to use Docker for compilation. We have prepared the Paddle Serving compilation environment for you: - CPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-devel`,dockerfile: [Dockerfile.devel](../tools/Dockerfile.devel) - GPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-gpu-devel`,dockerfile: [Dockerfile.gpu.devel](../tools/Dockerfile.gpu.devel) -This document will take Python2 as an example to show how to compile Paddle Serving. If you want to compile with Python 3, just adjust the Python options of cmake. +This document will take Python2 as an example to show how to compile Paddle Serving. 
If you want to compile with Python3, just adjust the Python options of cmake: + +- Set `DPYTHON_INCLUDE_DIR` to `$PYTHONROOT/include/python3.6m/` +- Set `DPYTHON_LIBRARIES` to `$PYTHONROOT/lib64/libpython3.6.so` +- Set `DPYTHON_EXECUTABLE` to `$PYTHONROOT/bin/python3` ## Get Code @@ -54,6 +58,8 @@ make -j10 execute `make install` to put targets under directory `./output` +**Attention:** After the compilation is successful, you need to set the path of `SERVING_BIN`. See [Note](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md#Note) for details. + ## Compile Client ``` shell diff --git a/doc/COMPILE_CN.md b/doc/COMPILE_CN.md index eb334232d98f26e68d719d10cbe458a356738d2f..c6e5426f02335598277ceb40fafc5215c7f03b2b 100644 --- a/doc/COMPILE_CN.md +++ b/doc/COMPILE_CN.md @@ -9,14 +9,18 @@ - Golang: 1.9.2及以上 - Git:2.17.1及以上 - CMake:3.2.2及以上 -- Python:2.7.2及以上 +- Python:2.7.2及以上 / 3.6及以上 推荐使用Docker编译,我们已经为您准备好了Paddle Serving编译环境: - CPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-devel`,dockerfile: [Dockerfile.devel](../tools/Dockerfile.devel) - GPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-gpu-devel`,dockerfile: [Dockerfile.gpu.devel](../tools/Dockerfile.gpu.devel) -本文档将以Python2为例介绍如何编译Paddle Serving。如果您想用Python3进行编译,只需要调整cmake的Python相关选项即可。 +本文档将以Python2为例介绍如何编译Paddle Serving。如果您想用Python3进行编译,只需要调整cmake的Python相关选项即可: + +- 将`DPYTHON_INCLUDE_DIR`设置为`$PYTHONROOT/include/python3.6m/` +- 将`DPYTHON_LIBRARIES`设置为`$PYTHONROOT/lib64/libpython3.6.so` +- 将`DPYTHON_EXECUTABLE`设置为`$PYTHONROOT/bin/python3` ## 获取代码 @@ -54,6 +58,8 @@ make -j10 执行`make install`可以把目标产出放在`./output`目录下。 +**注意:** 编译成功后,需要设置`SERVING_BIN`路径,详见后面的[注意事项](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项)。 + ## 编译Client部分 ``` shell diff --git a/doc/PERFORMANCE_OPTIM.md b/doc/PERFORMANCE_OPTIM.md new file mode 100644 index 0000000000000000000000000000000000000000..4b025e94d6f8d3ed69fb76898eb6afada9ca6613 --- /dev/null +++ b/doc/PERFORMANCE_OPTIM.md @@ -0,0 +1,18 @@ +# Performance optimization + +Due to different model structures, different prediction services consume different computing resources when performing predictions. For online prediction services, models that require fewer computing resources spend a larger share of their time on communication and are called communication-intensive services; models that require more computing resources spend more of their time on inference computation and are called computation-intensive services. + +For a prediction service, the easiest way to determine which type it belongs to is to look at the time breakdown. Paddle Serving provides a [Timeline tool](../python/examples/util/README_CN.md), which can intuitively display the time spent in each stage of the prediction service. + +For communication-intensive prediction services, requests can be aggregated: within a tolerable latency limit, multiple prediction requests can be combined into one batch for prediction. + +For computation-intensive prediction services, you can use GPU prediction services instead of CPU prediction services, or increase the number of graphics cards for GPU prediction services. + +Under the same conditions, the communication time of the HTTP prediction service provided by Paddle Serving is longer than that of the RPC prediction service, so for communication-intensive services, please give priority to using RPC communication.
+ +Parameters for performance optimization: + +| Parameters | Type | Default | Description | +| ---------- | ---- | ------- | ------------------------------------------------------------ | +| mem_optim | bool | False | Enable memory / graphic memory optimization | +| ir_optim | bool | False | Enable analysis and optimization of calculation graph, including OP fusion, etc. | diff --git a/doc/PERFORMANCE_OPTIM_CN.md b/doc/PERFORMANCE_OPTIM_CN.md index dd17bc8afab8472f8f55b4870f73e4c481e97cd3..7bd64d3e2d645c9328ead55e867d0b97946840ad 100644 --- a/doc/PERFORMANCE_OPTIM_CN.md +++ b/doc/PERFORMANCE_OPTIM_CN.md @@ -1,6 +1,6 @@ # 性能优化 -由于模型结构的不同,在执行预测时不同的预测对计算资源的消耗也不相同,对于在线的预测服务来说,对计算资源要求较少的模型,通信的时间成本占比就会较高,称为通信密集型服务,对计算资源要求较多的模型,推理计算的时间成本较高,称为计算密集型服务。对于这两种服务类型,可以根据实际需求采取不同的方式进行优化 +由于模型结构的不同,在执行预测时不同的预测服务对计算资源的消耗也不相同。对于在线的预测服务来说,对计算资源要求较少的模型,通信的时间成本占比就会较高,称为通信密集型服务,对计算资源要求较多的模型,推理计算的时间成本较高,称为计算密集型服务。对于这两种服务类型,可以根据实际需求采取不同的方式进行优化 对于一个预测服务来说,想要判断属于哪种类型,最简单的方法就是看时间占比,Paddle Serving提供了[Timeline工具](../python/examples/util/README_CN.md),可以直观的展现预测服务中各阶段的耗时。 @@ -10,4 +10,9 @@ 在相同条件下,Paddle Serving提供的HTTP预测服务的通信时间是大于RPC预测服务的,因此对于通信密集型的服务请优先考虑使用RPC的通信方式。 -对于模型较大,预测服务内存或显存占用较多的情况,可以通过将--mem_optim选项设置为True来开启内存/显存优化。 +性能优化相关参数: + +| 参数 | 类型 | 默认值 | 含义 | +| --------- | ---- | ------ | -------------------------------- | +| mem_optim | bool | False | 开启内存/显存优化 | +| ir_optim | bool | False | 开启计算图分析优化,包括OP融合等 | diff --git a/doc/SAVE.md b/doc/SAVE.md index c1e6b19a45c75a64207802984f52c734d44f8fc8..3f7f97e12e1e309ff0933e150ea7bcd23298b60e 100644 --- a/doc/SAVE.md +++ b/doc/SAVE.md @@ -1,8 +1,9 @@ -## How to save a servable model of Paddle Serving? +# How to save a servable model of Paddle Serving? ([简体中文](./SAVE_CN.md)|English) -- Currently, paddle serving provides a save_model interface for users to access, the interface is similar with `save_inference_model` of Paddle. +## Save from training or prediction script +Currently, Paddle Serving provides a save_model interface for users; it is similar to the `save_inference_model` interface of Paddle. ``` python import paddle_serving_client.io as serving_io serving_io.save_model("imdb_model", "imdb_client_conf", @@ -29,3 +30,15 @@ for line in sys.stdin: fetch_map = client.predict(feed=feed, fetch=fetch) print("{} {}".format(fetch_map["prediction"][1], label[0])) ``` + +## Export from saved model files +If you have saved model files using Paddle's `save_inference_model` API, you can use Paddle Serving's `inference_model_to_serving` API to convert them into model files that can be used by Paddle Serving. +``` +import paddle_serving_client.io as serving_io +serving_io.inference_model_to_serving(dirname, model_filename=None, params_filename=None, serving_server="serving_server", serving_client="serving_client") +``` +dirname (str) - Path of the saved model files. The program file and parameter files are stored in this directory. +model_filename (str, optional) - The name of the file that stores the inference program. If it is None, the default filename `__model__` will be used. Default: None. +params_filename (str, optional) - The name of the file that stores all parameters. It is only needed when all parameters were saved in a single binary file. If parameters were saved in separate files, set it to None. Default: None. +serving_server (str, optional) - The path of model files and configuration files for the server. Default: "serving_server". +serving_client (str, optional) - The path of configuration files for the client. Default: "serving_client".
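The `inference_model_to_serving` interface documented above can be driven end to end with a few lines. The following is a minimal sketch only: it assumes a model previously saved with Paddle's `save_inference_model` into a hypothetical `inference_model` directory with parameters stored as separate files, and it simply uses the documented default output directory names.

```python
# Minimal sketch of the conversion flow described in doc/SAVE.md above.
# "inference_model" is a hypothetical input directory, not a path from this PR.
import paddle_serving_client.io as serving_io

serving_io.inference_model_to_serving(
    dirname="inference_model",        # directory produced by save_inference_model
    model_filename=None,              # None: program stored in the default __model__ file
    params_filename=None,             # None: parameters stored as separate files
    serving_server="serving_server",  # output: server-side model files and config
    serving_client="serving_client")  # output: client-side config
```

The resulting `serving_server` directory can then be passed to `--model` when starting a serving instance.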
diff --git a/doc/SAVE_CN.md b/doc/SAVE_CN.md index 43b62c2ac623b386505356194ac136ea305fe683..fc75cd8d015a6d6f42a08f29e4035db20f450d91 100644 --- a/doc/SAVE_CN.md +++ b/doc/SAVE_CN.md @@ -1,8 +1,9 @@ -## 怎样保存用于Paddle Serving的模型? +# 怎样保存用于Paddle Serving的模型? (简体中文|[English](./SAVE.md)) -- 目前,Paddle Serving提供了一个save_model接口供用户访问,该接口与Paddle的`save_inference_model`类似。 +## 从训练或预测脚本中保存 +目前,Paddle Serving提供了一个save_model接口供用户访问,该接口与Paddle的`save_inference_model`类似。 ``` python import paddle_serving_client.io as serving_io @@ -29,3 +30,15 @@ for line in sys.stdin: fetch_map = client.predict(feed=feed, fetch=fetch) print("{} {}".format(fetch_map["prediction"][1], label[0])) ``` + +## 从已保存的模型文件中导出 +如果已使用Paddle 的`save_inference_model`接口保存出预测要使用的模型,则可以通过Paddle Serving的`inference_model_to_serving`接口转换成可用于Paddle Serving的模型文件。 +``` +import paddle_serving_client.io as serving_io +serving_io.inference_model_to_serving(dirname, model_filename=None, params_filename=None, serving_server="serving_server", serving_client="serving_client") +``` +dirname (str) – 需要转换的模型文件存储路径,Program结构文件和参数文件均保存在此目录。 +model_filename (str,可选) – 存储需要转换的模型Inference Program结构的文件名称。如果设置为None,则使用 __model__ 作为默认的文件名。默认值为None。 +params_filename (str,可选) – 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None。默认值为None。 +serving_server (str, 可选) - 转换后的模型文件和配置文件的存储路径。默认值为"serving_server"。 +serving_client (str, 可选) - 转换后的客户端配置文件存储路径。默认值为"serving_client"。 diff --git a/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h b/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h index 24148e374e51cb42cb0d8d1423e0ca009e9e8294..a4d8dda71a7977185106bb1552cb8f39ef6bc50e 100644 --- a/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h +++ b/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h @@ -194,6 +194,12 @@ class FluidCpuAnalysisDirCore : public FluidFamilyCore { analysis_config.EnableMemoryOptim(); } + if (params.enable_ir_optimization()) { + analysis_config.SwitchIrOptim(true); + } else { + analysis_config.SwitchIrOptim(false); + } + AutoLock lock(GlobalPaddleCreateMutex::instance()); _core = paddle::CreatePaddlePredictor(analysis_config); diff --git a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h index a3fa365444a40d505b16b22e702d4a8b69699073..2fc6ae587ff26f5f05ff9332f08067ab49d06254 100644 --- a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h +++ b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h @@ -198,6 +198,12 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore { analysis_config.EnableMemoryOptim(); } + if (params.enable_ir_optimization()) { + analysis_config.SwitchIrOptim(true); + } else { + analysis_config.SwitchIrOptim(false); + } + AutoLock lock(GlobalPaddleCreateMutex::instance()); _core = paddle::CreatePaddlePredictor(analysis_config); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c1590fb1b36de669f89711f95c4d49aedadb0c91..07699da458ab62ad1a5b9ece83547799d08f8cf7 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -19,6 +19,8 @@ endif() if (CLIENT) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.client.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../tools/python_tag.py + ${CMAKE_CURRENT_BINARY_DIR}/python_tag.py) endif() if (APP) @@ -43,7 +45,8 @@ if (APP) add_custom_command( OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp COMMAND cp -r 
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_app/ ${PADDLE_SERVING_BINARY_DIR}/python/ - COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel) + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + DEPENDS ${SERVING_APP_CORE} general_model_config_py_proto ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp) endif() @@ -52,6 +55,7 @@ add_custom_command( OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_client/ ${PADDLE_SERVING_BINARY_DIR}/python/ COMMAND ${CMAKE_COMMAND} -E copy ${SERVING_CLIENT_CORE} ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/serving_client.so + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} python_tag.py COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel DEPENDS ${SERVING_CLIENT_CORE} sdk_configure_py_proto ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINARY_DIR}/.timestamp) diff --git a/python/examples/criteo_ctr_with_cube/README.md b/python/examples/criteo_ctr_with_cube/README.md index 056035a68335d88fef813b834538643b8cc04ea0..02125422af7e7ce53a05a1eff9a43159034a79dc 100755 --- a/python/examples/criteo_ctr_with_cube/README.md +++ b/python/examples/criteo_ctr_with_cube/README.md @@ -2,16 +2,6 @@ ([简体中文](./README_CN.md)|English) -### Compile Source Code -in the root directory of this git project -``` -mkdir build_server -cd build_server -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON .. -make -j10 -make install -j10 -``` - ### Get Sample Dataset go to directory `python/examples/criteo_ctr_with_cube` diff --git a/python/examples/criteo_ctr_with_cube/README_CN.md b/python/examples/criteo_ctr_with_cube/README_CN.md index 2ba22770b770f318ba0c3ef503c34571d2976e8f..3b6f812ca53bd435e9b11b59e2a459c46ee3f864 100644 --- a/python/examples/criteo_ctr_with_cube/README_CN.md +++ b/python/examples/criteo_ctr_with_cube/README_CN.md @@ -1,16 +1,6 @@ ## 带稀疏参数索引服务的CTR预测服务 (简体中文|[English](./README.md)) -### 编译源代码 -在本项目的根目录下,执行 -``` -mkdir build_server -cd build_server -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON .. 
-make -j10 -make install -j10 -``` - ### 获取样例数据 进入目录 `python/examples/criteo_ctr_with_cube` ``` diff --git a/python/paddle_serving_app/local_predict.py b/python/paddle_serving_app/local_predict.py index 133aa4ccf32d29538d5b7032874f2c770e55e184..6620994165306a550204498e5185bb3aacca8ffd 100644 --- a/python/paddle_serving_app/local_predict.py +++ b/python/paddle_serving_app/local_predict.py @@ -71,6 +71,7 @@ class Debugger(object): if profile: config.enable_profile() config.set_cpu_math_library_num_threads(cpu_num) + config.switch_ir_optim(False) self.predictor = create_paddle_predictor(config) diff --git a/python/paddle_serving_client/__init__.py b/python/paddle_serving_client/__init__.py index 95d74303dce0d932806e02549ec7a56d2f562446..858f889bf257588cf43e3ab30b1b9330de479ba6 100644 --- a/python/paddle_serving_client/__init__.py +++ b/python/paddle_serving_client/__init__.py @@ -260,10 +260,16 @@ class Client(object): if i == 0: int_feed_names.append(key) if isinstance(feed_i[key], np.ndarray): + if key in self.lod_tensor_set: + raise ValueError( + "LodTensor var can not be ndarray type.") int_shape.append(list(feed_i[key].shape)) else: int_shape.append(self.feed_shapes_[key]) if isinstance(feed_i[key], np.ndarray): + if key in self.lod_tensor_set: + raise ValueError( + "LodTensor var can not be ndarray type.") #int_slot.append(np.reshape(feed_i[key], (-1)).tolist()) int_slot.append(feed_i[key]) self.has_numpy_input = True @@ -274,10 +280,16 @@ class Client(object): if i == 0: float_feed_names.append(key) if isinstance(feed_i[key], np.ndarray): + if key in self.lod_tensor_set: + raise ValueError( + "LodTensor var can not be ndarray type.") float_shape.append(list(feed_i[key].shape)) else: float_shape.append(self.feed_shapes_[key]) if isinstance(feed_i[key], np.ndarray): + if key in self.lod_tensor_set: + raise ValueError( + "LodTensor var can not be ndarray type.") #float_slot.append(np.reshape(feed_i[key], (-1)).tolist()) float_slot.append(feed_i[key]) self.has_numpy_input = True diff --git a/python/paddle_serving_client/io/__init__.py b/python/paddle_serving_client/io/__init__.py index 74a6ca871b5c1e32b3c1ecbc6656c95d7c78a399..4f174866e5521577ba35f39216f7dd0793879a6c 100644 --- a/python/paddle_serving_client/io/__init__.py +++ b/python/paddle_serving_client/io/__init__.py @@ -103,17 +103,21 @@ def save_model(server_model_folder, fout.write(config.SerializeToString()) -def inference_model_to_serving(infer_model, serving_client, serving_server): +def inference_model_to_serving(dirname, + model_filename=None, + params_filename=None, + serving_server="serving_server", + serving_client="serving_client"): place = fluid.CPUPlace() exe = fluid.Executor(place) inference_program, feed_target_names, fetch_targets = \ - fluid.io.load_inference_model(dirname=infer_model, executor=exe) + fluid.io.load_inference_model(dirname=dirname, executor=exe, model_filename=model_filename, params_filename=params_filename) feed_dict = { x: inference_program.global_block().var(x) for x in feed_target_names } fetch_dict = {x.name: x for x in fetch_targets} - save_model(serving_client, serving_server, feed_dict, fetch_dict, + save_model(serving_server, serving_client, feed_dict, fetch_dict, inference_program) feed_names = feed_dict.keys() fetch_names = fetch_dict.keys() diff --git a/python/paddle_serving_server/__init__.py b/python/paddle_serving_server/__init__.py index a58fb11ac3ee1fbe5086ae4381f6d6208c0c73ec..971359fca0df3a122b28889e0711c86364a1c45d 100644 --- a/python/paddle_serving_server/__init__.py +++ 
b/python/paddle_serving_server/__init__.py @@ -127,6 +127,7 @@ class Server(object): self.model_toolkit_conf = None self.resource_conf = None self.memory_optimization = False + self.ir_optimization = False self.model_conf = None self.workflow_fn = "workflow.prototxt" self.resource_fn = "resource.prototxt" @@ -175,6 +176,9 @@ class Server(object): def set_memory_optimize(self, flag=False): self.memory_optimization = flag + def set_ir_optimize(self, flag=False): + self.ir_optimization = flag + def check_local_bin(self): if "SERVING_BIN" in os.environ: self.use_local_bin = True @@ -195,6 +199,7 @@ class Server(object): engine.enable_batch_align = 0 engine.model_data_path = model_config_path engine.enable_memory_optimization = self.memory_optimization + engine.enable_ir_optimization = self.ir_optimization engine.static_optimization = False engine.force_update_static_cache = False @@ -244,7 +249,7 @@ class Server(object): workflow_oi_config_path = None if isinstance(model_config_paths, str): # If there is only one model path, use the default infer_op. - # Because there are several infer_op type, we need to find + # Because there are several infer_op type, we need to find # it from workflow_conf. default_engine_names = [ 'general_infer_0', 'general_dist_kv_infer_0', @@ -284,8 +289,8 @@ class Server(object): # check config here # print config here - def use_mkl(self): - self.mkl_flag = True + def use_mkl(self, flag): + self.mkl_flag = flag def get_device_version(self): avx_flag = False @@ -300,6 +305,10 @@ class Server(object): else: device_version = "serving-cpu-avx-openblas-" else: + if mkl_flag: + print( + "Your CPU does not support AVX, the server will run in noavx-openblas mode." + ) device_version = "serving-cpu-noavx-openblas-" return device_version diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py index 395177a8c77e5c608c2e0364b1d43ac534172d66..70aafbf5c3da4d1a2a8ec50ce5a2258383863057 100644 --- a/python/paddle_serving_server/serve.py +++ b/python/paddle_serving_server/serve.py @@ -41,6 +41,9 @@ def parse_args(): # pylint: disable=doc-string-missing "--device", type=str, default="cpu", help="Type of device") parser.add_argument( "--mem_optim", type=bool, default=False, help="Memory optimize") + parser.add_argument( + "--ir_optim", type=bool, default=False, help="Graph optimize") + parser.add_argument("--use_mkl", type=bool, default=False, help="Use MKL") parser.add_argument( "--max_body_size", type=int, @@ -57,7 +60,9 @@ def start_standard_model(): # pylint: disable=doc-string-missing workdir = args.workdir device = args.device mem_optim = args.mem_optim + ir_optim = args.ir_optim max_body_size = args.max_body_size + use_mkl = args.use_mkl if model == "": print("You must specify your serving model") @@ -78,6 +83,8 @@ def start_standard_model(): # pylint: disable=doc-string-missing server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_num_threads(thread_num) server.set_memory_optimize(mem_optim) + server.set_ir_optimize(ir_optim) + server.use_mkl(use_mkl) server.set_max_body_size(max_body_size) server.set_port(port) diff --git a/python/paddle_serving_server_gpu/__init__.py b/python/paddle_serving_server_gpu/__init__.py index 5fa4f010f2112bd400b81ba2f616e4ebe963a810..5a06bd712a836617047b0cc947956fc5d2213daa 100644 --- a/python/paddle_serving_server_gpu/__init__.py +++ b/python/paddle_serving_server_gpu/__init__.py @@ -47,6 +47,8 @@ def serve_args(): "--name", type=str, default="None", help="Default service name") parser.add_argument(
"--mem_optim", type=bool, default=False, help="Memory optimize") + parser.add_argument( + "--ir_optim", type=bool, default=False, help="Graph optimize") parser.add_argument( "--max_body_size", type=int, @@ -156,6 +158,7 @@ class Server(object): self.model_toolkit_conf = None self.resource_conf = None self.memory_optimization = False + self.ir_optimization = False self.model_conf = None self.workflow_fn = "workflow.prototxt" self.resource_fn = "resource.prototxt" @@ -204,6 +207,9 @@ class Server(object): def set_memory_optimize(self, flag=False): self.memory_optimization = flag + def set_ir_optimize(self, flag=False): + self.ir_optimization = flag + def check_local_bin(self): if "SERVING_BIN" in os.environ: self.use_local_bin = True @@ -240,6 +246,7 @@ class Server(object): engine.enable_batch_align = 0 engine.model_data_path = model_config_path engine.enable_memory_optimization = self.memory_optimization + engine.enable_ir_optimization = self.ir_optimization engine.static_optimization = False engine.force_update_static_cache = False diff --git a/python/paddle_serving_server_gpu/serve.py b/python/paddle_serving_server_gpu/serve.py index 512b5ec0a7d15a030afdcaa5e8daa344b29fb96e..297ff25d2084bead186fa4b9037e5de8282df0fe 100644 --- a/python/paddle_serving_server_gpu/serve.py +++ b/python/paddle_serving_server_gpu/serve.py @@ -35,6 +35,7 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss thread_num = args.thread model = args.model mem_optim = args.mem_optim + ir_optim = args.ir_optim max_body_size = args.max_body_size workdir = "{}_{}".format(args.workdir, gpuid) @@ -57,6 +58,7 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_num_threads(thread_num) server.set_memory_optimize(mem_optim) + server.set_ir_optimize(ir_optim) server.set_max_body_size(max_body_size) server.load_model_config(model) diff --git a/tools/Dockerfile.centos6.devel b/tools/Dockerfile.centos6.devel index dd519a0a08bd9fc02b7ad51d248912c2a22a811d..dd5a2ef786ed8a9c239a99cabbcfe2d482e6341c 100644 --- a/tools/Dockerfile.centos6.devel +++ b/tools/Dockerfile.centos6.devel @@ -21,7 +21,7 @@ RUN yum -y install wget && \ wget https://www.python.org/ftp/python/2.7.5/Python-2.7.5.tgz && \ tar -zxf Python-2.7.5.tgz && \ cd Python-2.7.5 && \ - ./configure --prefix=/usr/local/python2.7 --enable-shared && \ + ./configure --prefix=/usr/local/python2.7 --enable-shared --enable-unicode=ucs4 && \ make all && make install && \ make clean && \ echo 'export PATH=/usr/local/python2.7/bin:$PATH' >> /root/.bashrc && \ diff --git a/tools/Dockerfile.centos6.gpu.devel b/tools/Dockerfile.centos6.gpu.devel index 3288f09d4cacc8aa7fa0bd112dc6bf97939ecde5..c34780c151e960134af5f8b448e0465b8285e8b2 100644 --- a/tools/Dockerfile.centos6.gpu.devel +++ b/tools/Dockerfile.centos6.gpu.devel @@ -21,7 +21,7 @@ RUN yum -y install wget && \ wget https://www.python.org/ftp/python/2.7.5/Python-2.7.5.tgz && \ tar -zxf Python-2.7.5.tgz && \ cd Python-2.7.5 && \ - ./configure --prefix=/usr/local/python2.7 --enable-shared && \ + ./configure --prefix=/usr/local/python2.7 --enable-shared --enable-unicode=ucs4 && \ make all && make install && \ make clean && \ echo 'export PATH=/usr/local/python2.7/bin:$PATH' >> /root/.bashrc && \ diff --git a/tools/python_tag.py b/tools/python_tag.py new file mode 100644 index 0000000000000000000000000000000000000000..75947cff0b1b39d4c262a306bbe2bc878ae7d3ba --- /dev/null +++ b/tools/python_tag.py @@ -0,0 +1,20 @@ 
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag +import re +with open("setup.cfg", "w") as f: + line = "[bdist_wheel]\npython-tag={0}{1}\nplat-name=linux_x86_64".format( + get_abbr_impl(), get_impl_ver()) + f.write(line)
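The server-side options touched by this patch (`set_memory_optimize`, `set_ir_optimize`, `use_mkl`) can also be set from a standalone Python script instead of the `--mem_optim` / `--ir_optim` / `--use_mkl` flags. The sketch below is a minimal illustration that mirrors the general flow of `start_standard_model()` in `paddle_serving_server/serve.py`; the op pipeline, model directory, workdir and port are assumptions for the example, not values taken from this diff.

```python
# Hedged sketch of a standalone CPU serving script using the new options;
# it follows the same call sequence as the standard serve entry point.
from paddle_serving_server import OpMaker, OpSeqMaker, Server

op_maker = OpMaker()
read_op = op_maker.create('general_reader')
infer_op = op_maker.create('general_infer')
response_op = op_maker.create('general_response')

op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(infer_op)
op_seq_maker.add_op(response_op)

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(10)
server.set_memory_optimize(True)   # same effect as --mem_optim
server.set_ir_optimize(True)       # same effect as --ir_optim (graph analysis, OP fusion)
server.use_mkl(True)               # same effect as --use_mkl (CPU build only)
server.load_model_config("uci_housing_model")  # illustrative model directory
server.prepare_server(workdir="workdir", port=9292, device="cpu")
server.run_server()
```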