Commit 81593989 authored by HexToString

fix_encryption and doc

......@@ -31,8 +31,7 @@ message( "WITH_GPU = ${WITH_GPU}")
# Paddle Version should be one of:
# latest: latest develop build
# version number like 1.5.2
#SET(PADDLE_VERSION "2.0.0-rc1")
SET(PADDLE_VERSION "latest")
SET(PADDLE_VERSION "2.0.0")
if (WITH_GPU)
if (WITH_TRT)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7-avx-mkl-trt6")
......@@ -136,8 +135,8 @@ if (WITH_TRT)
endif()
if (WITH_LITE)
ADD_LIBRARY(paddle_api_full_bundled STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_api_full_bundled PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/lite/cxx/lib/libpaddle_api_full_bundled.a)
ADD_LIBRARY(paddle_full_api_shared STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_full_api_shared PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/lite/cxx/lib/libpaddle_full_api_shared.so)
if (WITH_XPU)
ADD_LIBRARY(xpuapi SHARED IMPORTED GLOBAL)
......@@ -160,7 +159,7 @@ LIST(APPEND paddle_depend_libs
xxhash cryptopp)
if(WITH_LITE)
LIST(APPEND paddle_depend_libs paddle_api_full_bundled)
LIST(APPEND paddle_depend_libs paddle_full_api_shared)
if(WITH_XPU)
LIST(APPEND paddle_depend_libs xpuapi xpurt)
endif()
......
......@@ -7,6 +7,7 @@ PROTOBUF_GENERATE_CPP(pdcodegen_proto_srcs pdcodegen_proto_hdrs
LIST(APPEND pdcodegen_srcs ${pdcodegen_proto_srcs})
add_executable(pdcodegen ${pdcodegen_srcs})
add_dependencies(pdcodegen boost)
target_link_libraries(pdcodegen protobuf ${PROTOBUF_PROTOC_LIBRARY})
# install
......
......@@ -34,6 +34,42 @@
**A:** http rpc
## Installation Issues
#### Q: When installing the whl package with `pip install`, the following error is reported:
```
Collecting opencv-python
Using cached opencv-python-4.3.0.38.tar.gz (88.0 MB)
Installing build dependencies ... done
Getting requirements to build wheel ... error
ERROR: Command errored out with exit status 1:
command: /home/work/Python-2.7.17/build/bin/python /home/work/Python-2.7.17/build/lib/python2.7/site-packages/pip/_vendor/pep517/_in_process.py get_requires_for_build_wheel /tmp/tmpLiweA9
cwd: /tmp/pip-install-_w6AUI/opencv-python
Complete output (22 lines):
Traceback (most recent call last):
File "/home/work/Python-2.7.17/build/lib/python2.7/site-packages/pip/_vendor/pep517/_in_process.py", line 280, in <module>
main()
File "/home/work/Python-2.7.17/build/lib/python2.7/site-packages/pip/_vendor/pep517/_in_process.py", line 263, in main
json_out['return_val'] = hook(**hook_input['kwargs'])
File "/home/work/Python-2.7.17/build/lib/python2.7/site-packages/pip/_vendor/pep517/_in_process.py", line 114, in get_requires_for_build_wheel
return hook(config_settings)
File "/tmp/pip-build-env-AUCbP4/overlay/lib/python2.7/site-packages/setuptools/build_meta.py", line 146, in get_requires_for_build_wheel
return self._get_build_requires(config_settings, requirements=['wheel'])
File "/tmp/pip-build-env-AUCbP4/overlay/lib/python2.7/site-packages/setuptools/build_meta.py", line 127, in _get_build_requires
self.run_setup()
File "/tmp/pip-build-env-AUCbP4/overlay/lib/python2.7/site-packages/setuptools/build_meta.py", line 243, in run_setup
self).run_setup(setup_script=setup_script)
File "/tmp/pip-build-env-AUCbP4/overlay/lib/python2.7/site-packages/setuptools/build_meta.py", line 142, in run_setup
exec(compile(code, __file__, 'exec'), locals())
File "setup.py", line 448, in <module>
main()
File "setup.py", line 99, in main
% {"ext": re.escape(sysconfig.get_config_var("EXT_SUFFIX"))}
File "/home/work/Python-2.7.17/build/lib/python2.7/re.py", line 210, in escape
s = list(pattern)
TypeError: 'NoneType' object is not iterable
```
**A:** Install a pinned opencv-python version first, `pip install opencv-python==4.2.0.32`, then install the whl package.
## Compilation Issues
......
......@@ -2,35 +2,15 @@
([简体中文](./INFERENCE_TO_SERVING_CN.md)|English)
You should know the following before converting an inference model to a Serving model.
**inference_model_dir**: the directory of the Paddle inference model
**serving_client_dir**: the directory of server side configuration
**serving_client_dir**: the directory of client side configuration
**model_filename**: the model description file, named `__model__` by default; if your model uses a different file name, set `model_filename` explicitly
**params_filename**: during `save_inference_model`, each Variable is saved as a separate file by default. If the parameters of your inference model were combined into a single file, set `params_filename` explicitly
## Example
``` python
from paddle_serving_client.io import inference_model_to_serving
inference_model_dir = "your_inference_model"
serving_client_dir = "serving_client_dir"
serving_server_dir = "serving_server_dir"
feed_var_names, fetch_var_names = inference_model_to_serving(
inference_model_dir, serving_server_dir, serving_client_dir)
```
If your model is saved as a single program file plus a single parameters file, use the following API.
```
feed_var_names, fetch_var_names = inference_model_to_serving(
inference_model_dir, serving_server_dir, serving_client_dir,
model_filename="model", params_filename="params")
You can use the built-in Python module `paddle_serving_client.convert` to do the conversion.
```python
python -m paddle_serving_client.convert --dirname ./your_inference_model_dir
```
The arguments are the same as those of the `inference_model_to_serving` API; a short Python sketch follows the table below.
| Argument | Type | Default | Description |
|--------------|------|-----------|--------------------------------|
| `dirname` | str | - | Path of saved model files. Program file and parameter files are saved in this directory. |
| `serving_server` | str | `"serving_server"` | The path of model files and configuration files for server. |
| `serving_client` | str | `"serving_client"` | The path of configuration files for client. |
| `model_filename` | str | None | The name of file to load the inference program. If it is None, the default filename `__model__` will be used. |
| `params_filename` | str | None | The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. |
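For reference, a minimal Python sketch of the same conversion (the directory names are placeholders for your own paths):

```python
# Minimal sketch: convert an inference model with the Python API.
from paddle_serving_client.io import inference_model_to_serving

inference_model_dir = "your_inference_model"  # placeholder path
serving_server_dir = "serving_server"
serving_client_dir = "serving_client"
feed_var_names, fetch_var_names = inference_model_to_serving(
    inference_model_dir, serving_server_dir, serving_client_dir)
```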
......@@ -2,32 +2,15 @@
([English](./INFERENCE_TO_SERVING.md)|简体中文)
## Example
You need to know the following before running the conversion below.
**Model directory**: the directory where the Paddle inference_model is stored
**serving_client_dir**: after the inference_model is converted into a Serving model, the server-side configuration is saved in this directory
**serving_client_dir**: after the inference_model is converted into a Serving model, the client-side configuration is saved in this directory
**Model description file**: the model description file, i.e. `model_filename`, is a pb2 text file named `__model__` by default; if it has a different name, specify it explicitly
**Model parameter file**: during `save_inference_model`, each Variable is saved as a separate binary file by default, in which case nothing needs to be specified. If all parameters were saved compressed into a single file, set `params_filename` explicitly
``` python
from paddle_serving_client.io import inference_model_to_serving
inference_model_dir = "your_inference_model"
serving_client_dir = "serving_client_dir"
serving_server_dir = "serving_server_dir"
feed_var_names, fetch_var_names = inference_model_to_serving(
inference_model_dir, serving_server_dir, serving_client_dir)
```
If the model has a model description file `model_filename` and a model parameter file `params_filename`, use
```
feed_var_names, fetch_var_names = inference_model_to_serving(
inference_model_dir, serving_server_dir, serving_client_dir,
model_filename="model", params_filename="params")
You can use the built-in module `paddle_serving_client.convert` provided by Paddle Serving to do the conversion.
```python
python -m paddle_serving_client.convert --dirname ./your_inference_model_dir
```
The module arguments are the same as those of the `inference_model_to_serving` API; a Python sketch for the single-file case follows the table below.
| Argument | Type | Default | Description |
|--------------|------|-----------|--------------------------------|
| `dirname` | str | - | Path of the model to convert. Both the Program file and the parameter files are stored in this directory. |
| `serving_server` | str | `"serving_server"` | Output path of the converted model files and server-side configuration files. |
| `serving_client` | str | `"serving_client"` | Output path of the converted client-side configuration files. |
| `model_filename` | str | None | Name of the file that stores the inference Program to convert. If set to None, `__model__` is used as the default file name. |
| `params_filename` | str | None | Name of the file that stores all parameters of the model to convert. Set it only when all parameters were saved in a single binary file; if parameters were saved in separate files, leave it as None. |
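For the single-file case, a minimal Python sketch (the file names `model` and `params` are placeholders for your own model):

```python
# Sketch: convert a model whose program and parameters are each saved in one file.
from paddle_serving_client.io import inference_model_to_serving

feed_var_names, fetch_var_names = inference_model_to_serving(
    "your_inference_model", "serving_server", "serving_client",
    model_filename="model", params_filename="params")
```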
......@@ -19,7 +19,9 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-p
#cuda 10.0
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py3-none-any.whl
#cuda10.1 with TensorRT 6
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3-none-any.whl
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post101-py3-none-any.whl
#cuda10.2 with TensorRT 7
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post102-py3-none-any.whl
```
### Python 2
```
......@@ -27,8 +29,11 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py2-none-any.whl
#cuda 10.0
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py2-none-any.whl
##cuda10.1 with TensorRT 6
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py2-none-any.whl
#cuda10.1 with TensorRT 6
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post101-py2-none-any.whl
#cuda10.2 with TensorRT 7
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post102-py2-none-any.whl
```
## Client
......
......@@ -112,10 +112,8 @@ The second is to deploy GPU Serving and Java Client separately. If they are on t
**It should be noted that in the example, all models (not pipeline) need to use `--use_multilang` to enable gRPC multi-language support, and the port number is 9393. If you need a different port, modify it in the Java file.**
**Currently Serving has launched the Pipeline mode (see [Pipeline Serving](../doc/PIPELINE_SERVING.md) for details). Pipeline Serving Client for Java is released, the next version multi-thread java client example will be released**
**Currently Serving has launched the Pipeline mode (see [Pipeline Serving](../doc/PIPELINE_SERVING.md) for details). The Pipeline Serving Client for Java has been released.**
**It should be noted that in the example, Java Pipeline Client code is in path /Java/Examples and /Java/src/main, and the Pipeline server code is in path /python/examples/pipeline/
The Client IP and Port(which is configured in java/examples/src/main/java/PipelineClientExample.java) should be corresponding to the Pipeline Server IP and Port(which is configured in config.yaml)
**
**It should be noted that in the example, the Java Pipeline Client code is under /Java/Examples and /Java/src/main, and the Pipeline server code is under /python/examples/pipeline/. The client IP and port (configured in java/examples/src/main/java/PipelineClientExample.java) must match the Pipeline server IP and port (configured in config.yaml).**
......@@ -111,11 +111,9 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Pipeli
The second option is to deploy GPU Serving and the Java Client separately. If they are on the same host, use ifconfig to find the container's IP address, modify the endpoint passed to client.connect in `examples/src/main/java/PaddleServingClientExample.java`, and compile again. Alternatively, start Docker with `--net=host` to share the host's network devices, so the Java code can run without modification.
**Note that in the example, all non-pipeline models need `--use_multilang` to enable gRPC multi-language support, and the port number is 9393; if you need a different port, modify it in the Java file**
**Note that in the example, all non-Pipeline models need `--use_multilang` to enable gRPC multi-language support, and the port number is 9393; if you need a different port, modify it in the Java file**
**Serving now provides the Pipeline mode (see [Pipeline Serving](../doc/PIPELINE_SERVING_CN.md) for details); the Pipeline Serving Client for Java has been released.**
**Note that the Java Pipeline Client examples are under /Java/Examples and /Java/src/main, and the corresponding Pipeline servers are under /python/examples/pipeline/.
The ip and port in java/examples/src/main/java/PipelineClientExample.java must match the ip and port configured in config.yaml of the corresponding Pipeline server under /python/examples/pipeline/**
**Note that the Java Pipeline Client examples are under /Java/Examples and /Java/src/main, and the corresponding Pipeline servers are under /python/examples/pipeline/
The ip and port in java/examples/src/main/java/PipelineClientExample.java must match the ip and port configured in config.yaml of the corresponding Pipeline server under /python/examples/pipeline/.**
......@@ -128,20 +128,22 @@ class FluidArmAnalysisCore : public FluidFamilyCore {
config.DisableGpu();
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
if (params.use_lite()) {
config.EnableLiteEngine(PrecisionType::kFloat32, true);
}
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
if (params.use_xpu()) {
config.EnableXpu(2 * 1024 * 1024);
}
if (params.use_lite()) {
config.EnableLiteEngine(PrecisionType::kFloat32, true);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
if (params.use_xpu()) {
config.EnableXpu(100);
if (params.enable_ir_optimization()) {
config.SwitchIrOptim(true);
} else {
config.SwitchIrOptim(false);
}
config.SwitchSpecifyInputNames(true);
......@@ -173,6 +175,14 @@ class FluidArmAnalysisDirCore : public FluidFamilyCore {
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
if (params.use_lite()) {
config.EnableLiteEngine(PrecisionType::kFloat32, true);
}
if (params.use_xpu()) {
config.EnableXpu(2 * 1024 * 1024);
}
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
......@@ -183,14 +193,6 @@ class FluidArmAnalysisDirCore : public FluidFamilyCore {
config.SwitchIrOptim(false);
}
if (params.use_lite()) {
config.EnableLiteEngine(PrecisionType::kFloat32, true);
}
if (params.use_xpu()) {
config.EnableXpu(100);
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = CreatePredictor(config);
if (NULL == _core.get()) {
......
......@@ -81,25 +81,45 @@ if (SERVER)
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
elseif(WITH_TRT)
if(CUDA_VERSION EQUAL 10.1)
set(SUFFIX 101)
elseif(CUDA_VERSION EQUAL 10.2)
set(SUFFIX 102)
elseif(CUDA_VERSION EQUAL 11.0)
set(SUFFIX 110)
endif()
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" trt
"server_gpu" ${SUFFIX}
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
elseif(WITH_LITE)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" arm
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
if(WITH_XPU)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" arm-xpu
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
else()
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" arm
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
endif()
else()
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
......
......@@ -3,9 +3,10 @@
([简体中文](./README_CN.md)|English)
In the example, a BERT model is used for semantic understanding prediction, and the text is represented as a vector, which can be used for further analysis and prediction.
If your Python version is 3.X, replace 'pip' with 'pip3' and 'python' with 'python3' in the commands below.
### Getting Model
Method 1:
This example uses the [BERT Chinese Model](https://www.paddlepaddle.org.cn/hubdetail?name=bert_chinese_L-12_H-768_A-12&en_category=SemanticModel) from [PaddleHub](https://github.com/PaddlePaddle/PaddleHub).
Install paddlehub first:
......@@ -22,11 +23,13 @@ the 128 in the command above means max_seq_len in BERT model, which is the lengt
the config file and model file for server side are saved in the folder bert_seq128_model.
the config file generated for client side is saved in the folder bert_seq128_client.
Method 2:
You can also download the above model from BOS (max_seq_len=128). After decompression, the config file and model file for the server side are stored in the bert_chinese_L-12_H-768_A-12_model folder, and the config file generated for the client side is stored in the bert_chinese_L-12_H-768_A-12_client folder:
```shell
wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz
tar -xzf bert_chinese_L-12_H-768_A-12.tar.gz
```
If your model is bert_chinese_L-12_H-768_A-12_model, replace 'bert_seq128_model' with 'bert_chinese_L-12_H-768_A-12_model' and 'bert_seq128_client' with 'bert_chinese_L-12_H-768_A-12_client' in the commands below.
### Getting Dict and Sample Dataset
......@@ -36,11 +39,11 @@ sh get_data.sh
This script downloads the Chinese dictionary file vocab.txt and the Chinese sample data data-c.txt.
### RPC Inference Service
Run
To start the CPU inference service, run:
```
python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #cpu inference service
```
Or
Or, to start the GPU inference service, run:
```
python -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0
```
......@@ -59,12 +62,18 @@ head data-c.txt | python bert_client.py --model bert_seq128_client/serving_clien
The client reads data from data-c.txt and sends prediction requests; the prediction result is the word-vector representation of the text (because the vectors are large, they are not printed).
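For orientation, the core of such an RPC client looks roughly like the sketch below; the server address and the fetch name `pooled_output` are assumptions, so check serving_client_conf.prototxt for the real variable names:

```python
# Rough sketch of an RPC client (not the full bert_client.py).
from paddle_serving_client import Client
from paddle_serving_app.reader import ChineseBertReader

client = Client()
client.load_client_config("bert_seq128_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])  # address of the RPC service started above

reader = ChineseBertReader({"vocab_file": "vocab.txt", "max_seq_len": 128})
feed_dict = reader.process("hello world")  # one sentence -> BERT feed tensors
# "pooled_output" is an assumed fetch name; use the names from your client config.
fetch_map = client.predict(feed=feed_dict, fetch=["pooled_output"])
print(fetch_map)
```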
### HTTP Inference Service
To start the CPU HTTP inference service, run:
```
python bert_web_service.py bert_seq128_model/ 9292 #launch cpu inference service
```
Or, to start the GPU HTTP inference service, run:
```
export CUDA_VISIBLE_DEVICES=0,1
```
Set the environment variable to specify which GPUs are used; the command above means GPU 0 and GPU 1 are used.
```
python bert_web_service.py bert_seq128_model/ 9292 #launch gpu inference service
python bert_web_service_gpu.py bert_seq128_model/ 9292 #launch gpu inference service
```
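An HTTP request against the service above can be sent with any HTTP client; the sketch below assumes the default web-service route `/bert/prediction` (the service is named "bert" in bert_web_service.py) and an assumed fetch name `pooled_output`:

```python
# Rough sketch of an HTTP request to the BERT web service.
import json
import requests

url = "http://127.0.0.1:9292/bert/prediction"  # route assumed from service name "bert"
data = {"feed": [{"words": "hello world"}], "fetch": ["pooled_output"]}
resp = requests.post(url, data=json.dumps(data),
                     headers={"Content-Type": "application/json"})
print(resp.json())
```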
### HTTP Inference
......
......@@ -4,8 +4,9 @@
In this example, a BERT model is used for semantic understanding; the text is represented as a vector, which can be used for further analysis and prediction.
If your Python version is 3.X, replace pip with pip3 and python with python3 in the commands below.
### Getting the Model
Method 1:
This example uses the [BERT Chinese Model](https://www.paddlepaddle.org.cn/hubdetail?name=bert_chinese_L-12_H-768_A-12&en_category=SemanticModel) from [PaddleHub](https://github.com/PaddlePaddle/PaddleHub).
Install paddlehub first:
```
......@@ -19,11 +20,15 @@ python prepare_model.py 128
The server-side config file and model file are generated and stored in the bert_seq128_model folder.
The client-side config file is generated and stored in the bert_seq128_client folder.
Method 2:
You can also download the above model directly from BOS (max_seq_len=128). After decompression, the server-side config file and model file are stored in the bert_chinese_L-12_H-768_A-12_model folder, and the client-side config file is stored in the bert_chinese_L-12_H-768_A-12_client folder:
```shell
wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz
tar -xzf bert_chinese_L-12_H-768_A-12.tar.gz
```
If you use the bert_chinese_L-12_H-768_A-12_model model, replace bert_seq128_model with bert_chinese_L-12_H-768_A-12_model and bert_seq128_client with bert_chinese_L-12_H-768_A-12_client in the commands below.
### Getting the Dict and Sample Data
......@@ -33,13 +38,15 @@ sh get_data.sh
The script downloads the Chinese dictionary vocab.txt and the Chinese sample data data-c.txt.
### Starting the RPC Inference Service
Run
To start the CPU inference service, run:
```
python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #start the cpu inference service
```
Or
Or, to start the GPU inference service, run:
```
python -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #start the gpu inference service on GPU 0
```
### Running Prediction
......@@ -51,17 +58,28 @@ pip install paddle_serving_app
Run
```
head data-c.txt | python bert_client.py --model bert_seq128_client/serving_client_conf.prototxt
```
The client reads the data in data-c.txt and sends prediction requests; the result is the vector representation of the text (because the output is large, the script does not print it). The server address can be changed inside the script.
### Starting the HTTP Inference Service
To start the CPU HTTP inference service, run:
```
python bert_web_service.py bert_seq128_model/ 9292 #start the cpu inference service
```
Or, to start the GPU HTTP inference service, run:
```
export CUDA_VISIBLE_DEVICES=0,1
```
The environment variable specifies which GPUs the GPU inference service uses; this example selects the two GPUs with indices 0 and 1.
```
python bert_web_service.py bert_seq128_model/ 9292 #start the gpu inference service
python bert_web_service_gpu.py bert_seq128_model/ 9292 #start the gpu inference service
```
### Running Prediction
```
......
# coding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_server_gpu.web_service import WebService
from paddle_serving_app.reader import ChineseBertReader
import sys
import os
import numpy as np
class BertService(WebService):
    def load(self):
        # Build the BERT reader used to turn raw text into model inputs.
        self.reader = ChineseBertReader({
            "vocab_file": "vocab.txt",
            "max_seq_len": 128
        })

    def preprocess(self, feed=[], fetch=[]):
        feed_res = []
        is_batch = False
        for ins in feed:
            # Tokenize the input text and reshape every field to (seq_len, 1).
            feed_dict = self.reader.process(ins["words"].encode("utf-8"))
            for key in feed_dict.keys():
                feed_dict[key] = np.array(feed_dict[key]).reshape(
                    (len(feed_dict[key]), 1))
            feed_res.append(feed_dict)
        return feed_res, fetch, is_batch


bert_service = BertService(name="bert")
bert_service.load()
bert_service.load_model_config(sys.argv[1])
bert_service.prepare_server(
    workdir="workdir", port=int(sys.argv[2]), device="gpu")
bert_service.run_rpc_service()
bert_service.run_web_service()
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing_example/encrypt.tar.gz
tar -xzf encrypt.tar.gz
cp -rvf ../fit_a_line/uci_housing_model .
cp -rvf ../fit_a_line/uci_housing_client .
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
......@@ -34,7 +34,7 @@ python test_client.py uci_housing_client/serving_client_conf.prototxt
Start a web service with default web service hosting modules:
``` shell
python test_server.py
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --name uci
```
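The started service can also be queried over HTTP; the sketch below assumes the route `/uci/prediction` (from `--name uci`), a feed variable `x` with 13 features and a fetch variable `price`, which follow the usual uci_housing example but should be confirmed in uci_housing_client/serving_client_conf.prototxt:

```python
# Rough sketch of an HTTP request to the uci web service started above.
import json
import requests

url = "http://127.0.0.1:9393/uci/prediction"
data = {"feed": [{"x": [0.0] * 13}],  # 13 feature values; replace with a real normalized sample
        "fetch": ["price"]}
resp = requests.post(url, data=json.dumps(data),
                     headers={"Content-Type": "application/json"})
print(resp.json())
```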
### Client prediction
......
......@@ -35,7 +35,7 @@ python test_client.py uci_housing_client/serving_client_conf.prototxt
Start the default web service with the following command:
``` shell
python test_server.py
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --name uci
```
### Client Prediction
......
......@@ -20,6 +20,9 @@ op:
# model path
model_config: ResNet50_vd_model
# Device type: if unset, determined by `devices` (CPU/GPU). 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 1
# Device IDs: when devices is "" or omitted, prediction runs on CPU; when it is "0" or "0,1,2", prediction runs on the listed GPU cards
devices: "0" # "0,1"
......
......@@ -20,7 +20,10 @@ op:
# uci model path
model_config: uci_housing_model
# Device IDs: when devices is "" or omitted, prediction runs on CPU; when it is "0" or "0,1,2", prediction runs on the listed GPU cards
# Device type: if unset, determined by `devices` (CPU/GPU). 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 0
# Device IDs: the hardware type is decided by device_type first. When devices is "" or omitted, prediction runs on CPU; when it is "0" or "0,1,2", prediction runs on the listed GPU cards
devices: "" # "0,1"
# Client type: brpc, grpc or local_predictor. local_predictor does not start a Serving service; prediction runs in-process
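For reference, the server-side interpretation of `device_type` (sketched from the LocalServiceHandler changes later in this commit) boils down to a mapping like this:

```python
# Sketch of how device_type selects the runtime flags (mirrors LocalServiceHandler).
# -1 = decide from `devices`, 0 = cpu, 1 = gpu, 2 = TensorRT, 3 = arm cpu, 4 = kunlun xpu
def resolve_device(device_type, devices):
    use_gpu = use_trt = use_lite = use_xpu = False
    if device_type == -1:
        name = "cpu" if devices == "" else "gpu"
        use_gpu = devices != ""
    elif device_type == 0:
        name = "cpu"
    elif device_type == 1:
        name, use_gpu = "gpu", True
    elif device_type == 2:
        name, use_gpu, use_trt = "gpu", True, True
    elif device_type == 3:
        name, use_lite = "arm", True
    elif device_type == 4:
        name, use_lite, use_xpu = "arm", True, True
    else:
        raise ValueError("unsupported device_type: {}".format(device_type))
    return name, use_gpu, use_trt, use_lite, use_xpu
```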
......
......@@ -20,6 +20,7 @@ import google.protobuf.text_format
import numpy as np
import argparse
import paddle.fluid as fluid
import paddle.inference as inference
from .proto import general_model_config_pb2 as m_config
from paddle.fluid.core import PaddleTensor
from paddle.fluid.core import AnalysisConfig
......@@ -125,14 +126,14 @@ class LocalPredictor(object):
if use_lite:
config.enable_lite_engine(
precision_mode = PrecisionType.Float32,
zero_copy = True,
passes_filter = [],
ops_filter = []
)
precision_mode=inference.PrecisionType.Float32,
zero_copy=True,
passes_filter=[],
ops_filter=[])
if use_xpu:
config.enable_xpu(100 * 1024 * 1024)
# 8MB L3 cache
config.enable_xpu(8 * 1024 * 1024)
self.predictor = create_paddle_predictor(config)
......
......@@ -20,7 +20,7 @@ from paddle_serving_server import OpMaker, OpSeqMaker, Server
from paddle_serving_client import Client
from contextlib import closing
import socket
import numpy as np
from paddle_serving_server import pipeline
from paddle_serving_server.pipeline import Op
......@@ -74,8 +74,8 @@ class WebService(object):
f = open(client_config, 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
self.feed_names = [var.alias_name for var in model_conf.feed_var]
self.fetch_names = [var.alias_name for var in model_conf.fetch_var]
self.feed_vars = {var.name: var for var in model_conf.feed_var}
self.fetch_vars = {var.name: var for var in model_conf.fetch_var}
def _launch_rpc_service(self):
op_maker = OpMaker()
......@@ -211,6 +211,15 @@ class WebService(object):
def preprocess(self, feed=[], fetch=[]):
print("This API will be deprecated later. Please do not use it")
is_batch = True
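# Batch the request: reshape each sample's tensors to the declared variable
# shape, add a leading batch axis, then concatenate all samples per feed key.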
feed_dict = {}
for var_name in self.feed_vars.keys():
feed_dict[var_name] = []
for feed_ins in feed:
for key in feed_ins:
feed_dict[key].append(np.array(feed_ins[key]).reshape(list(self.feed_vars[key].shape))[np.newaxis,:])
feed = {}
for key in feed_dict:
feed[key] = np.concatenate(feed_dict[key], axis=0)
return feed, fetch, is_batch
def postprocess(self, feed=[], fetch=[], fetch_map=None):
......
......@@ -217,6 +217,7 @@ class Server(object):
self.module_path = os.path.dirname(paddle_serving_server.__file__)
self.cur_path = os.getcwd()
self.use_local_bin = False
self.device = "cpu"
self.gpuid = 0
self.use_trt = False
self.use_lite = False
......@@ -284,6 +285,9 @@ class Server(object):
"GPU not found, please check your environment or use cpu version by \"pip install paddle_serving_server\""
)
def set_device(self, device="cpu"):
self.device = device
def set_gpuid(self, gpuid=0):
self.gpuid = gpuid
......@@ -316,24 +320,25 @@ class Server(object):
engine.static_optimization = False
engine.force_update_static_cache = False
engine.use_trt = self.use_trt
engine.use_lite = self.use_lite
engine.use_xpu = self.use_xpu
if os.path.exists('{}/__params__'.format(model_config_path)):
suffix = ""
else:
suffix = "_DIR"
if device == "arm":
engine.use_lite = self.use_lite
engine.use_xpu = self.use_xpu
if device == "cpu":
if use_encryption_model:
engine.type = "FLUID_CPU_ANALYSIS_ENCRPT"
else:
engine.type = "FLUID_CPU_ANALYSIS_DIR"
engine.type = "FLUID_CPU_ANALYSIS"+suffix
elif device == "gpu":
if use_encryption_model:
engine.type = "FLUID_GPU_ANALYSIS_ENCRPT"
else:
engine.type = "FLUID_GPU_ANALYSIS_DIR"
engine.type = "FLUID_GPU_ANALYSIS"+suffix
elif device == "arm":
engine.type = "FLUID_ARM_ANALYSIS_DIR"
engine.type = "FLUID_ARM_ANALYSIS" + suffix
self.model_toolkit_conf.engines.extend([engine])
def _prepare_infer_service(self, port):
......@@ -434,9 +439,9 @@ class Server(object):
for line in version_file.readlines():
if re.match("cuda_version", line):
cuda_version = line.split("\"")[1]
if cuda_version == "trt":
if cuda_version == "101" or cuda_version == "102" or cuda_version == "110":
device_version = "serving-gpu-" + cuda_version + "-"
elif cuda_version == "arm":
elif cuda_version == "arm" or cuda_version == "arm-xpu":
device_version = "serving-" + cuda_version + "-"
else:
device_version = "serving-gpu-cuda" + cuda_version + "-"
......@@ -541,7 +546,8 @@ class Server(object):
else:
print("Use local bin : {}".format(self.bin_path))
#self.check_cuda()
if self.use_lite:
# Todo: merge CPU and GPU code, remove device to model_toolkit
if self.device == "cpu" or self.device == "arm":
command = "{} " \
"-enable_model_toolkit " \
"-inferservice_path {} " \
......
......@@ -76,6 +76,7 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin
server.set_lite()
device = "arm"
server.set_device(device)
if args.use_xpu:
server.set_xpu()
......
......@@ -81,8 +81,8 @@ class WebService(object):
f = open(client_config, 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
self.feed_names = [var.alias_name for var in model_conf.feed_var]
self.fetch_names = [var.alias_name for var in model_conf.fetch_var]
self.feed_vars = {var.name: var for var in model_conf.feed_var}
self.fetch_vars = {var.name: var for var in model_conf.fetch_var}
def set_gpus(self, gpus):
print("This API will be deprecated later. Please do not use it")
......@@ -118,6 +118,7 @@ class WebService(object):
server.set_num_threads(thread_num)
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
server.set_device(device)
if use_lite:
server.set_lite()
......@@ -289,6 +290,15 @@ class WebService(object):
def preprocess(self, feed=[], fetch=[]):
print("This API will be deprecated later. Please do not use it")
is_batch = True
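# Same batching as the CPU web service: reshape each sample to its variable's
# declared shape, prepend a batch axis, and concatenate per feed key.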
feed_dict = {}
for var_name in self.feed_vars.keys():
feed_dict[var_name] = []
for feed_ins in feed:
for key in feed_ins:
feed_dict[key].append(np.array(feed_ins[key]).reshape(list(self.feed_vars[key].shape))[np.newaxis,:])
feed = {}
for key in feed_dict:
feed[key] = np.concatenate(feed_dict[key], axis=0)
return feed, fetch, is_batch
def postprocess(self, feed=[], fetch=[], fetch_map=None):
......
......@@ -38,14 +38,12 @@ class LocalServiceHandler(object):
client_type='local_predictor',
workdir="",
thread_num=2,
device_type=-1,
devices="",
fetch_names=None,
mem_optim=True,
ir_optim=False,
available_port_generator=None,
use_trt=False,
use_lite=False,
use_xpu=False,
use_profile=False):
"""
Initialization of localservicehandler
......@@ -55,15 +53,14 @@ class LocalServiceHandler(object):
client_type: brpc, grpc and local_predictor[default]
workdir: work directory
thread_num: number of threads, concurrent quantity.
device_type: support multiple devices. -1=Not set, determined by
`devices`. 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
devices: gpu id list[gpu], "" default[cpu]
fetch_names: get fetch names out of LocalServiceHandler in
local_predictor mode. fetch_names_ is compatible for Client().
mem_optim: use memory/graphics memory optimization, True default.
ir_optim: use calculation chart optimization, False default.
available_port_generator: generate available ports
use_trt: use nvidia tensorRt engine, False default.
use_lite: use Paddle-Lite engine, False default.
use_xpu: run predict on Baidu Kunlun, False default.
use_profile: use profiling, False default.
Returns:
......@@ -74,28 +71,61 @@ class LocalServiceHandler(object):
self._model_config = model_config
self._port_list = []
self._device_type = "cpu"
if devices == "":
# cpu
devices = [-1]
if use_lite:
self._device_type = "arm"
self._port_list.append(available_port_generator.next())
_LOGGER.info("Model({}) will be launch in arm device. Port({})"
.format(model_config, self._port_list))
self._device_name = "cpu"
self._use_gpu = False
self._use_trt = False
self._use_lite = False
self._use_xpu = False
if device_type == -1:
# device_type is not set, determined by `devices`,
if devices == "":
# CPU
self._device_name = "cpu"
devices = [-1]
else:
self._device_type = "cpu"
self._port_list.append(available_port_generator.next())
_LOGGER.info("Model({}) will be launch in cpu device. Port({})"
.format(model_config, self._port_list))
else:
# gpu
self._device_type = "gpu"
# GPU
self._device_name = "gpu"
self._use_gpu = True
devices = [int(x) for x in devices.split(",")]
elif device_type == 0:
# CPU
self._device_name = "cpu"
devices = [-1]
elif device_type == 1:
# GPU
self._device_name = "gpu"
self._use_gpu = True
devices = [int(x) for x in devices.split(",")]
elif device_type == 2:
# Nvidia Tensor RT
self._device_name = "gpu"
self._use_gpu = True
devices = [int(x) for x in devices.split(",")]
self._use_trt = True
elif device_type == 3:
# ARM CPU
self._device_name = "arm"
devices = [-1]
self._use_lite = True
elif device_type == 4:
# Kunlun XPU
self._device_name = "arm"
devices = [int(x) for x in devices.split(",")]
self._use_lite = True
self._use_xpu = True
else:
_LOGGER.error(
"LocalServiceHandler initialization fail. device_type={}"
.format(device_type))
if client_type == "brpc" or client_type == "grpc":
for _ in devices:
self._port_list.append(available_port_generator.next())
_LOGGER.info("Model({}) will be launch in gpu device: {}. Port({})"
.format(model_config, devices, self._port_list))
_LOGGER.info("Create ports for devices:{}. Port:{}"
.format(devices, self._port_list))
self._client_type = client_type
self._workdir = workdir
self._devices = devices
......@@ -105,14 +135,21 @@ class LocalServiceHandler(object):
self._local_predictor_client = None
self._rpc_service_list = []
self._server_pros = []
self._use_trt = use_trt
self._use_lite = use_lite
self._use_xpu = use_xpu
self._use_profile = use_profile
self.fetch_names_ = fetch_names
self._fetch_names = fetch_names
_LOGGER.info(
"Models({}) will be launched by device {}. use_gpu:{}, "
"use_trt:{}, use_lite:{}, use_xpu:{}, device_type:{}, devices:{}, "
"mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
"client_type:{}, fetch_names:{}".format(
model_config, self._device_name, self._use_gpu, self._use_trt,
self._use_lite, self._use_xpu, device_type, self._devices,
self._mem_optim, self._ir_optim, self._use_profile,
self._thread_num, self._client_type, self._fetch_names))
def get_fetch_list(self):
return self.fetch_names_
return self._fetch_names
def get_port_list(self):
return self._port_list
......@@ -149,22 +186,17 @@ class LocalServiceHandler(object):
from paddle_serving_app.local_predict import LocalPredictor
if self._local_predictor_client is None:
self._local_predictor_client = LocalPredictor()
use_gpu = False
use_lite = False
if self._device_type == "gpu":
use_gpu = True
elif self._device_type == "arm":
use_lite = True
self._local_predictor_client.load_model_config(
model_path=self._model_config,
use_gpu=use_gpu,
use_gpu=self._use_gpu,
gpu_id=self._devices[concurrency_idx],
use_profile=self._use_profile,
thread_num=self._thread_num,
mem_optim=self._mem_optim,
ir_optim=self._ir_optim,
use_trt=self._use_trt,
use_lite=use_lite,
use_lite=self._use_lite,
use_xpu=self._use_xpu)
return self._local_predictor_client
......@@ -174,7 +206,7 @@ class LocalServiceHandler(object):
def _prepare_one_server(self, workdir, port, gpuid, thread_num, mem_optim,
ir_optim):
"""
According to _device_type, generating one CpuServer or GpuServer, and
According to self._device_name, generating one Cpu/Gpu/Arm Server, and
setting the model config and startup params.
Args:
......@@ -188,7 +220,7 @@ class LocalServiceHandler(object):
Returns:
server: CpuServer/GpuServer
"""
if self._device_type == "cpu":
if self._device_name == "cpu":
from paddle_serving_server import OpMaker, OpSeqMaker, Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
......@@ -217,6 +249,8 @@ class LocalServiceHandler(object):
server = Server()
if gpuid >= 0:
server.set_gpuid(gpuid)
# TODO: support arm or arm + xpu later
server.set_device(self._device_name)
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num)
......@@ -225,9 +259,9 @@ class LocalServiceHandler(object):
server.load_model_config(self._model_config)
server.prepare_server(
workdir=workdir, port=port, device=self._device_type)
if self.fetch_names_ is None:
self.fetch_names_ = server.get_fetch_list()
workdir=workdir, port=port, device=self._device_name)
if self._fetch_names is None:
self._fetch_names = server.get_fetch_list()
return server
def _start_one_server(self, service_idx):
......@@ -264,7 +298,7 @@ class LocalServiceHandler(object):
"""
Start multiple processes and start one server in each process
"""
for i, service in enumerate(self._rpc_service_list):
for i, _ in enumerate(self._rpc_service_list):
p = multiprocessing.Process(
target=self._start_one_server, args=(i, ))
p.daemon = True
......
......@@ -134,6 +134,7 @@ class Op(object):
self.model_config = None
self.workdir = None
self.thread_num = self.concurrency
self.device_type = -1
self.devices = ""
self.mem_optim = False
self.ir_optim = False
......@@ -153,6 +154,7 @@ class Op(object):
self.client_type = local_service_conf.get("client_type")
self.workdir = local_service_conf.get("workdir")
self.thread_num = local_service_conf.get("thread_num")
self.device_type = local_service_conf.get("device_type")
self.devices = local_service_conf.get("devices")
self.mem_optim = local_service_conf.get("mem_optim")
self.ir_optim = local_service_conf.get("ir_optim")
......@@ -168,6 +170,7 @@ class Op(object):
client_type=self.client_type,
workdir=self.workdir,
thread_num=self.thread_num,
device_type=self.device_type,
devices=self.devices,
mem_optim=self.mem_optim,
ir_optim=self.ir_optim)
......@@ -188,8 +191,11 @@ class Op(object):
client_type=self.client_type,
workdir=self.workdir,
thread_num=self.thread_num,
device_type=self.device_type,
devices=self.devices,
fetch_names=self._fetch_names)
fetch_names=self._fetch_names,
mem_optim=self.mem_optim,
ir_optim=self.ir_optim)
if self._client_config is None:
self._client_config = service_handler.get_client_config(
)
......@@ -550,7 +556,8 @@ class Op(object):
args=(concurrency_idx, self._get_input_channel(),
self._get_output_channels(), False, trace_buffer,
self.model_config, self.workdir, self.thread_num,
self.devices, self.mem_optim, self.ir_optim))
self.device_type, self.devices, self.mem_optim,
self.ir_optim))
p.daemon = True
p.start()
process.append(p)
......@@ -583,7 +590,8 @@ class Op(object):
args=(concurrency_idx, self._get_input_channel(),
self._get_output_channels(), True, trace_buffer,
self.model_config, self.workdir, self.thread_num,
self.devices, self.mem_optim, self.ir_optim))
self.device_type, self.devices, self.mem_optim,
self.ir_optim))
# When a process exits, it attempts to terminate
# all of its daemonic child processes.
t.daemon = True
......@@ -991,7 +999,7 @@ class Op(object):
def _run(self, concurrency_idx, input_channel, output_channels,
is_thread_op, trace_buffer, model_config, workdir, thread_num,
devices, mem_optim, ir_optim):
device_type, devices, mem_optim, ir_optim):
"""
_run() is the entry function of OP process / thread model.When client
type is local_predictor in process mode, the CUDA environment needs to
......@@ -1009,6 +1017,7 @@ class Op(object):
model_config: model config path
workdir: work directory
thread_num: number of threads, concurrent quantity
device_type: support multiple devices
devices: gpu id list[gpu], "" default[cpu]
mem_optim: use memory/graphics memory optimization, True default.
ir_optim: use calculation chart optimization, False default.
......@@ -1017,7 +1026,6 @@ class Op(object):
None
"""
op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx)
tid = threading.current_thread().ident
# init ops
profiler = None
......@@ -1028,6 +1036,7 @@ class Op(object):
client_type="local_predictor",
workdir=workdir,
thread_num=thread_num,
device_type=device_type,
devices=devices,
mem_optim=mem_optim,
ir_optim=ir_optim)
......
......@@ -234,6 +234,7 @@ class PipelineServer(object):
"local_service_conf": {
"workdir": "",
"thread_num": 2,
"device_type": -1,
"devices": "",
"mem_optim": True,
"ir_optim": False,
......@@ -389,6 +390,7 @@ class ServerYamlConfChecker(object):
default_conf = {
"workdir": "",
"thread_num": 2,
"device_type": -1,
"devices": "",
"mem_optim": True,
"ir_optim": False,
......@@ -397,6 +399,7 @@ class ServerYamlConfChecker(object):
"model_config": str,
"workdir": str,
"thread_num": int,
"device_type": int,
"devices": str,
"mem_optim": bool,
"ir_optim": bool,
......