Commit 9a2ffb15 authored by TeslaZhao

Update doc

Parent 4de287c6
......@@ -30,7 +30,7 @@ message( "WITH_GPU = ${WITH_GPU}")
# Paddle Version should be one of:
# latest: latest develop build
# version number like 1.5.2
SET(PADDLE_VERSION "2.3.0")
SET(PADDLE_VERSION "2.3.0-no-ort")
if (WITH_GPU)
message("CUDA: ${CUDA_VERSION}, CUDNN_MAJOR_VERSION: ${CUDNN_MAJOR_VERSION}")
# CUDA 11.0 is not supported; 11.2 will be added.
......@@ -171,25 +171,25 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mklml/lib)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib")
LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib")
LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib)
#SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib")
#LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib")
LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib)
#SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib")
#LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib)
if (NOT WITH_MKLML)
ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)
endif()
ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib/libpaddle2onnx.so)
#ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
#SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib/libpaddle2onnx.so)
ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib/libonnxruntime.so.1.10.0)
#ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
#SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib/libonnxruntime.so.1.10.0)
ADD_LIBRARY(paddle_inference STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.so)
SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.a)
if (WITH_ASCEND_CL)
SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.so)
endif()
......
......@@ -92,7 +92,6 @@ message Response {
repeated int64 profile_time = 2;
bool profile_server = 3;
uint64 log_id = 4;
// Error code
int32 err_no = 5;
// Error messages
......
......@@ -51,17 +51,14 @@ message EngineDesc {
/*
* "gpu_memory_mb": allocate gpu memory by config.EnableUseGpu()
* "cpu_math_thread_num": set thread numbers of cpu math by
* config.SetCpuMathLibraryNumThreads()
* "trt_workspace_size": set TensorRT workspace size by
* config.EnableTensorRtEngine(), 1 << 25 default
* "trt_use_static": If true, save the optimization information of the TRT
* serialized to the disk, and load from the disk.
* "cpu_math_thread_num": set thread numbers of cpu math by config.SetCpuMathLibraryNumThreads()
* "trt_workspace_size": set TensorRT workspace size by config.EnableTensorRtEngine(), 1 << 25 default
* "trt_use_static": If true, save the optimization information of the TRT serialized to the disk, and load from the disk.
*/
optional int32 gpu_memory_mb = 22 [ default = 100 ];
optional int32 cpu_math_thread_num = 23 [ default = 1 ];
optional int32 trt_workspace_size = 24 [ default = 33554432 ];
optional bool trt_use_static = 25 [ default = false ];
optional int32 gpu_memory_mb = 22 [default = 100];
optional int32 cpu_math_thread_num = 23 [default = 1];
optional int32 trt_workspace_size = 24 [default = 33554432];
optional bool trt_use_static = 25 [default = false];
/*
* "runtime_thread_num": n == 0 means don`t use Asynchronous task scheduling
......
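For orientation, these `EngineDesc` options mirror calls on the Paddle Inference `Config` object named in the comments above. A minimal Python sketch of that mapping, assuming a placeholder local model directory; it illustrates the defaults above, not the server's actual wiring:

```python
# Sketch: how the EngineDesc defaults above map onto paddle.inference
# Config calls. The model directory name is a placeholder.
from paddle.inference import Config

config = Config("uci_housing_model")
config.enable_use_gpu(100, 0)                # gpu_memory_mb = 100, on GPU 0
config.set_cpu_math_library_num_threads(1)   # cpu_math_thread_num = 1
config.enable_tensorrt_engine(
    workspace_size=1 << 25,                  # trt_workspace_size = 33554432
    use_static=False)                        # trt_use_static = False
```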
......@@ -94,9 +94,9 @@ message Response {
repeated int64 profile_time = 2;
bool profile_server = 3;
uint64 log_id = 4;
// Error code
int32 err_no = 5;
// Error messages
string err_msg = 6;
};
......
......@@ -94,9 +94,9 @@ message Response {
repeated int64 profile_time = 2;
bool profile_server = 3;
uint64 log_id = 4;
// Error code
int32 err_no = 5;
// Error messages
string err_msg = 6;
};
......
......@@ -21,6 +21,7 @@ option cc_generic_services = true;
message RequestAndResponse {
required int32 a = 1;
required float b = 2;
required uint64 log_id = 3 [ default = 0 ];
};
service LoadGeneralModelService {
......
......@@ -8,8 +8,8 @@
- [Enabling Synchronous Mode](#2.1)
- [Enabling Asynchronous Mode](#2.2)
- [Performance Testing](#3)
- [Test Data](#3.1)
- [Test Conclusions](#3.2)
- [Test Results](#3.1)
- [Test Data](#3.2)
<a name="1"></a>
......@@ -75,7 +75,7 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 16 --p
**2. Enabling Asynchronous Mode**
Use `--runtime_thread_num 4` and `--batch_infer_size 32` in the launch command to enable asynchronous mode. The Serving framework starts 8 asynchronous threads, merges at most 32 requests into one batch, and enables dynamic padding automatically.
Use `--runtime_thread_num 2` and `--batch_infer_size 32` in the launch command to enable asynchronous mode. The Serving framework starts 2 asynchronous threads, merges at most 32 requests into one batch, and enables dynamic padding automatically.
```
python3 -m paddle_serving_server.serve --model uci_housing_model --thread 16 --port 9292 --runtime_thread_num 2 --batch_infer_size 32 --ir_optim --gpu_multi_stream --gpu_ids 0
```
......@@ -84,11 +84,12 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 16 --p
## Performance Testing
GPU: Tesla P4 7611 MiB
Cuda: cuda11.2-cudnn8-trt8
Python: python3.7
Model: ResNet_v2_50
Test data: all-ones inputs; a single client sends 100 requests, with shapes ranging over (1, 224 ± 50, 224 ± 50)
- GPU: Tesla P4 7611 MiB
- CUDA: cuda11.2-cudnn8-trt8
- Python version: python3.7
- Model: ResNet_v2_50
- Test data: all-ones inputs; a single client sends 100 requests, with shapes ranging over (1, 224 ± 50, 224 ± 50)
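The variable-shape requests described above can be generated with the Python client. A minimal sketch; the config path, port, and feed/fetch names (`image`/`score`) are assumptions based on the ResNet example:

```python
# Sketch: a single client sending 100 all-ones requests whose H/W vary
# within 224 +/- 50, giving shape (1, h, w) as described above and
# exercising dynamic padding. Config path, port, and tensor names are
# assumptions; adjust to your deployment.
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("serving_client_conf.prototxt")
client.connect(["127.0.0.1:9393"])

for _ in range(100):
    h, w = np.random.randint(174, 275, size=2)
    img = np.ones((1, h, w), dtype=np.float32)
    fetch_map = client.predict(feed={"image": img}, fetch=["score"])
```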
Launch command for synchronous mode:
```
......@@ -102,7 +103,25 @@ python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --por
<a name="3.1"></a>
**1. Test Data**
**1. Test Results**
With asynchronous mode and dynamic batching enabled, throughput improves substantially when concurrently testing data of varying shapes.
<div align=center>
<img src='images/6-1_Cpp_Asynchronous_Framwork_CN_1.png' height = "600" align="middle"/>
</div>
Dynamic batching increases response latency, but testing shows that in most scenarios the throughput gain outweighs the latency growth, especially under high concurrency: at client=70, throughput rises 105% while latency grows only 33%.
|Client |1 |5 |10 | 20 |30 |40 |50 |70 |
|---|---|---|---|---|---|---|---|---|
|QPS |-2.08% |-7.23% |-1.89% |20.55% |23.02% |23.34% |46.41% |105.27% |
|Latency | 2.70% |7.09% |5.24% |13.34% |10.80% |43.60% |8.72% |33.89% |
Asynchronous mode effectively improves service throughput.
<a name="3.2"></a>
**2. Test Data**
1. Synchronous mode
......@@ -147,20 +166,5 @@ python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --por
|50 |1 |1.50 |50.60 |7578 |89.04 |121.545 |5000 |411.364 |331.118 |605.809 |874.543 |1285.650 |48.2343 |41.1369 |9350.0000 |2568777.6400 |295.8593|
|70 |1 |3.80 |83.20 |7602 |89.59 |133.568 |7000 |524.073 |382.653 |799.463 |1202.179 |1576.809 |57.2885 |52.4077 |10761.0000 |3013600.9670 |315.2540|
<a name="3.2"></a>
**2. Test Conclusions**
With asynchronous mode and dynamic batching enabled, throughput improves substantially when concurrently testing data of varying shapes.
<div align=center>
<img src='images/6-1_Cpp_Asynchronous_Framwork_CN_1.png' height = "600" align="middle"/>
</div>
Dynamic batching increases response latency, but testing shows that in most scenarios the throughput gain outweighs the latency growth, especially under high concurrency: at client=70, throughput rises 105% while latency grows only 33%.
|Client |1 |5 |10 | 20 |30 |40 |50 |70 |
|---|---|---|---|---|---|---|---|---|
|QPS |-2.08% |-7.23% |-1.89% |20.55% |23.02% |23.34% |46.41% |105.27% |
|Latency | 2.70% |7.09% |5.24% |13.34% |10.80% |43.60% |8.72% |33.89% |
Asynchronous mode effectively improves throughput.
......@@ -33,7 +33,7 @@ C++ Serving uses protobuf as the data format for requests and responses; the key structures are
## Tensor
Tensor can hold data of many types and is the basic unit of Request and Response. Tensor is defined as follows:
[Tensor](https://github.com/PaddlePaddle/Serving/blob/develop/core/general-server/proto/general_model_service.proto#L22) can hold data of many types and is the basic unit of Request and Response. Tensor is defined as follows:
```protobuf
message Tensor {
......@@ -104,7 +104,7 @@ Key members of the Tensor structure: `elem_type`, `shape`, `lod` and `name/alias_name`
- name/alias_name: name and alias, corresponding to the model configuration
- elem_type: data type; FLOAT32, INT64, INT32, UINT8, INT8 and FLOAT16 are currently supported
- shape: data dimensions
- lod: LoD (Level-of-Detail) Tensor is an advanced Paddle feature that extends Tensor to support more flexible data input. See[LOD](../LOD_CN.md)
- lod: LoD (Level-of-Detail) Tensor is an advanced Paddle feature that extends Tensor to support more flexible data input. See [LOD](../LOD_CN.md)
|elem_type|Type|
|---------|----|
......@@ -146,7 +146,7 @@ tensor->set_alias_name(alias_name);
// copy data
int total_number = float_data.size();
tensor->mutable_float_data()->Resize(total_number, 0);
memcpy(tensor->mutable_float_data()->mutable_data(), float_datadata(), total_number * sizeof(float));
memcpy(tensor->mutable_float_data()->mutable_data(), float_data.data(), total_number * sizeof(float));
```
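At the Python client level, the SDK performs this Tensor construction (`name/alias_name`, `elem_type`, `shape`, and the data copy) automatically from a numpy array. A minimal sketch, assuming the uci_housing example's config path and variable names:

```python
# Sketch: the Python SDK fills in the Tensor message from a numpy array.
# Config path, endpoint, and variable names follow the uci_housing example.
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])

x = np.array([[0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
               -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]],
              dtype=np.float32)
fetch_map = client.predict(feed={"x": x}, fetch=["price"])
```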
<a name="1.2"></a>
......@@ -174,8 +174,7 @@ tensor->set_tensor_content(string_data);
## Request
Request is the data the client sends; it uses Tensor as its basic data unit and carries additional request information. It is defined as follows:
```protobuf
message Request {
......@@ -186,7 +185,7 @@ message Request {
};
```
- fetch_var_names: names of the outputs to fetch; GeneralResponseOP filters results by this list. See the alias_name entries under the fetch_var field in the model file serving_client_conf.prototxt
- fetch_var_names: names of the outputs to fetch; `GeneralResponseOP` filters results by this list. See the `alias_name` entries under the `fetch_var` field in the model file `serving_client_conf.prototxt`
- profile_server: debug switch; when enabled, performance information is returned
- log_id: request ID
......@@ -211,12 +210,14 @@ Tensor *tensor = req.add_tensor();
**2. Building a Json Request**
When using RESTful requests, data can be sent in Json format. The exact format is as follows:
When using RESTful requests, data can be sent in Json format. An example follows:
```JSON
{"tensor":[{"float_data":[0.0137,-0.1136,0.2553,-0.0692,0.0582,-0.0727,-0.1583,-0.0584,0.6283,0.4919,0.1856,0.0795,-0.0332],"elem_type":1,"name":"x","alias_name":"x","shape":[1,13]}],"fetch_var_names":["price"],"log_id":0}
```
Follow this example: keep the overall structure unchanged and modify only the data types and data.
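A minimal sketch of sending this Json body over HTTP with the `requests` library; the URL (host, port, and service path) is an assumption and depends on how the RESTful endpoint is exposed:

```python
# Sketch: POST the Json Request above to a RESTful endpoint.
# The URL below is an assumption; adjust host, port, and path.
import requests

payload = {
    "tensor": [{
        "float_data": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
                       -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795,
                       -0.0332],
        "elem_type": 1,
        "name": "x",
        "alias_name": "x",
        "shape": [1, 13],
    }],
    "fetch_var_names": ["price"],
    "log_id": 0,
}
resp = requests.post("http://127.0.0.1:9393/GeneralModelService/inference",
                     json=payload)
print(resp.json())
```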
<a name="3"></a>
## Response
......@@ -242,8 +243,8 @@ message ModelOutput {
Core members of the Response structure:
- profile_time: performance information, returned when `request->set_profile_server(true)` is set
- err_no: error code; see `core/predictor/common/constant.h`
- err_msg: error message; see `core/predictor/common/constant.h`
- err_no: error code
- err_msg: error message
- engine_name: name of the output node
|err_no|err_msg|
......
......@@ -38,7 +38,7 @@ Paddle Serving's ABTest feature is built on the PYTHON SDK and multiple servers
**1. Installing the Paddle Serving Wheels**
Using the ABTest feature requires the PYTHON SDK, so the `paddle_serving_client` wheel package must be installed. The installation method is as follows:
Using the ABTest feature requires the PYTHON SDK, so the `paddle_serving_client` wheel package must be installed. The [installation method](./2-1_Docker_Images_CN.md) is as follows:
```
pip3 install paddle-serving-client==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
......@@ -48,7 +48,7 @@ pip3 install paddle-serving-client==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/s
**2. Downloading Multiple Models and Saving Model Parameters**
This example provides a one-click download script, `sh get_data.sh`, which downloads models trained in 3 different ways: `bow`, `cnn` and `lstm`.
This example provides a one-click download script, `sh get_data.sh`, which downloads self-trained models built in 3 different ways: `bow`, `cnn` and `lstm` (a sketch of splitting traffic across them follows below).
```
sh get_data.sh
......
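Once the downloaded models are served on separate ports, the PYTHON SDK can split traffic between them. A minimal sketch; the config path, endpoints, tags, and weights are assumptions, not part of this example's scripts:

```python
# Sketch: ABTest traffic split via the PYTHON SDK. The config path,
# endpoints, tags, and weights are assumptions; adjust to your deployment.
from paddle_serving_client import Client

client = Client()
client.load_client_config("imdb_bow_client_conf/serving_client_conf.prototxt")
# Send 50% of requests to the bow variant and 50% to the cnn variant.
client.add_variant("bow", ["127.0.0.1:9297"], 50)
client.add_variant("cnn", ["127.0.0.1:9298"], 50)
client.connect()
```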
# Python Pipeline Performance Testing
- [Test Environment](#1)
- [Performance Metrics](#2)
- [Performance Metrics and Conclusions](#2)
<a name="1"></a>
......@@ -18,7 +18,12 @@
<a name="2"></a>
## Performance Metrics
## Performance Metrics and Conclusions
Testing shows that Python Pipeline mode, using multi-process concurrency, makes full use of the GPU and achieves good throughput.
The test data are as follows:
|model_name |thread_num |batch_size |CPU_util(%) |GPU_memory(mb) |GPU_util(%) |qps(samples/s) |total count |mean(ms) |median(ms) |80 percent(ms) |90 percent(ms) |99 percent(ms) |total cost(s) |each cost(s)|
|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|
......
......@@ -121,7 +121,7 @@ class Server(object):
self.trt_dynamic_shape_info = []
self.gpu_memory_mb = 50
self.cpu_math_thread_num = 1
self.trt_workspace_size = 33554432 # 1 << 25
self.trt_workspace_size = 33554432 # 1 << 25
self.trt_use_static = False
def get_fetch_list(self, infer_node_idx=-1):
......@@ -358,10 +358,10 @@ class Server(object):
engine.use_xpu = self.use_xpu
engine.use_ascend_cl = self.use_ascend_cl
engine.use_gpu = False
engine.gpu_memory_mb = self.gpu_memory_mb
engine.cpu_math_thread_num = self.cpu_math_thread_num
engine.trt_workspace_size = self.trt_workspace_size
engine.trt_use_static = self.trt_use_static
#engine.gpu_memory_mb = self.gpu_memory_mb
#engine.cpu_math_thread_num = self.cpu_math_thread_num
#engine.trt_workspace_size = self.trt_workspace_size
#engine.trt_use_static = self.trt_use_static
# use distributed model.
if self.dist_subgraph_index >= 0:
......
......@@ -42,6 +42,13 @@ message Request {
message Response {
repeated ModelOutput outputs = 1;
repeated int64 profile_time = 2;
bool profile_server = 3;
uint64 log_id = 4;
// Error code
int32 err_no = 5;
// Error messages
string err_msg = 6;
};
message ModelOutput {
......