Commit 9a2ffb15 authored by TeslaZhao

Update doc

Parent 4de287c6
@@ -30,7 +30,7 @@ message( "WITH_GPU = ${WITH_GPU}")
 # Paddle Version should be one of:
 # latest: latest develop build
 # version number like 1.5.2
-SET(PADDLE_VERSION "2.3.0")
+SET(PADDLE_VERSION "2.3.0-no-ort")
 if (WITH_GPU)
 message("CUDA: ${CUDA_VERSION}, CUDNN_MAJOR_VERSION: ${CUDNN_MAJOR_VERSION}")
 # cuda 11.0 is not supported, 11.2 would be added.
@@ -171,25 +171,25 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mklml/lib)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib")
 LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib")
-LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib")
-LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib)
+#SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib")
+#LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib)
+#SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib")
+#LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib)
 if (NOT WITH_MKLML)
 ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)
 endif()
-ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib/libpaddle2onnx.so)
-ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib/libonnxruntime.so.1.10.0)
+#ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
+#SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib/libpaddle2onnx.so)
+#ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
+#SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib/libonnxruntime.so.1.10.0)
 ADD_LIBRARY(paddle_inference STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.so)
+SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.a)
 if (WITH_ASCEND_CL)
 SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.so)
 endif()
......
@@ -92,7 +92,6 @@ message Response {
   repeated int64 profile_time = 2;
   bool profile_server = 3;
   uint64 log_id = 4;
   // Error code
   int32 err_no = 5;
   // Error messages
......
@@ -51,17 +51,14 @@ message EngineDesc {
 /*
  * "gpu_memory_mb": allocate gpu memory by config.EnableUseGpu()
- * "cpu_math_thread_num": set thread numbers of cpu math by
- * config.SetCpuMathLibraryNumThreads()
- * "trt_workspace_size": set TensorRT workspace size by
- * config.EnableTensorRtEngine(), 1 << 25 default
- * "trt_use_static": If true, save the optimization information of the TRT
- * serialized to the disk, and load from the disk.
+ * "cpu_math_thread_num": set thread numbers of cpu math by config.SetCpuMathLibraryNumThreads()
+ * "trt_workspace_size": set TensorRT workspace size by config.EnableTensorRtEngine(), 1 << 25 default
+ * "trt_use_static": If true, save the optimization information of the TRT serialized to the disk, and load from the disk.
 */
-optional int32 gpu_memory_mb = 22 [ default = 100 ];
-optional int32 cpu_math_thread_num = 23 [ default = 1 ];
-optional int32 trt_workspace_size = 24 [ default = 33554432 ];
-optional bool trt_use_static = 25 [ default = false ];
+optional int32 gpu_memory_mb = 22 [default = 100];
+optional int32 cpu_math_thread_num = 23 [default = 1];
+optional int32 trt_workspace_size = 24 [default = 33554432];
+optional bool trt_use_static = 25 [default = false];
 /*
 * "runtime_thread_num": n == 0 means don`t use Asynchronous task scheduling
......
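For orientation, a sketch of the Paddle Inference (Python) calls that the EngineDesc options in the hunk above correspond to; the model directory is a placeholder and the values shown are the proto defaults:

```python
# A hedged sketch of the config calls named in the EngineDesc comment block;
# the model directory is a placeholder, values are the EngineDesc defaults.
from paddle.inference import Config

config = Config("serving_server_model_dir")    # placeholder model dir
config.enable_use_gpu(100, 0)                  # gpu_memory_mb = 100, GPU id 0
config.set_cpu_math_library_num_threads(1)     # cpu_math_thread_num = 1
config.enable_tensorrt_engine(
    workspace_size=1 << 25,                    # trt_workspace_size = 33554432
    use_static=False)                          # trt_use_static = false
```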
@@ -94,9 +94,9 @@ message Response {
   repeated int64 profile_time = 2;
   bool profile_server = 3;
   uint64 log_id = 4;
   // Error code
   int32 err_no = 5;
   // Error messages
   string err_msg = 6;
 };
......
@@ -94,9 +94,9 @@ message Response {
   repeated int64 profile_time = 2;
   bool profile_server = 3;
   uint64 log_id = 4;
   // Error code
   int32 err_no = 5;
   // Error messages
   string err_msg = 6;
 };
......
@@ -21,6 +21,7 @@ option cc_generic_services = true;
 message RequestAndResponse {
   required int32 a = 1;
   required float b = 2;
+  required uint64 log_id = 3 [ default = 0 ];
 };
 service LoadGeneralModelService {
......
@@ -8,8 +8,8 @@
 - [Enabling synchronous mode](#2.1)
 - [Enabling asynchronous mode](#2.2)
 - [Performance testing](#3)
-  - [Test data](#3.1)
-  - [Test conclusions](#3.2)
+  - [Test results](#3.1)
+  - [Test data](#3.2)
 <a name="1"></a>
@@ -75,7 +75,7 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 16 --p
 **II. Enabling asynchronous mode**
-The launch command uses `--runtime_thread_num 4` and `--batch_infer_size 32` to enable asynchronous mode: the Serving framework starts 8 asynchronous threads, merges at most 32 requests into one batch, and enables dynamic padding automatically.
+The launch command uses `--runtime_thread_num 2` and `--batch_infer_size 32` to enable asynchronous mode: the Serving framework starts 2 asynchronous threads, merges at most 32 requests into one batch, and enables dynamic padding automatically.
 ```
 python3 -m paddle_serving_server.serve --model uci_housing_model --thread 16 --port 9292 --runtime_thread_num 4 --batch_infer_size 32 --ir_optim --gpu_multi_stream --gpu_ids 0
 ```
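For reference, a minimal client sketch that exercises the service launched above; the feed/fetch names (`x`, `price`) follow the uci_housing example configs, and the config path is an assumption:

```python
# A hedged client sketch for the uci_housing service; feed name "x" and fetch
# name "price" come from the uci_housing example, the path is an assumption.
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])

x = np.ones((1, 13), dtype=np.float32)  # all-ones input of shape [1, 13]
fetch_map = client.predict(feed={"x": x}, fetch=["price"], batch=True)
print(fetch_map)
```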
@@ -84,11 +84,12 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 16 --p
 ## Performance testing
-GPU: Tesla P4 7611 MiB
-Cuda: cuda11.2-cudnn8-trt8
-Python: python3.7
-Model: ResNet_v2_50
-Test data: all-ones inputs, 100 requests from a single client, shape range (1, 224 ± 50, 224 ± 50)
+- GPU: Tesla P4 7611 MiB
+- CUDA: cuda11.2-cudnn8-trt8
+- Python version: python3.7
+- Model: ResNet_v2_50
+- Test data: all-ones inputs, 100 requests from a single client, shape range (1, 224 ± 50, 224 ± 50)
 Launch command for synchronous mode:
 ```
@@ -102,7 +103,25 @@ python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --por
<a name="3.1"></a> <a name="3.1"></a>
**一.测试数据** **一.测试结果**
使用异步模式,并开启动态批量后,并发测试不同 shape 数据时,吞吐性能大幅提升。
<div align=center>
<img src='images/6-1_Cpp_Asynchronous_Framwork_CN_1.png' height = "600" align="middle"/>
</div
由于动态批量导致响应时长增长,经过测试,大多数场景下吞吐增量高于响应时长增长,尤其在高并发场景(client=70时),在响应时长增长 33% 情况下,吞吐增加 105%。
|Client |1 |5 |10 | 20 |30 |40 |50 |70 |
|---|---|---|---|---|---|---|---|---|
|QPS |-2.08% |-7.23% |-1.89% |20.55% |23.02% |23.34% |46.41% |105.27% |
|响应时长 | 2.70% |7.09% |5.24% |13.34% |10.80% |43.60% |8.72% |33.89% |
异步模式可有效提升服务吞吐性能。
<a name="3.2"></a>
**二.测试数据**
 1. Synchronous mode
@@ -147,20 +166,5 @@ python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --por
 |50 |1 |1.50 |50.60 |7578 |89.04 |121.545 |5000 |411.364 |331.118 |605.809 |874.543 |1285.650 |48.2343 |41.1369 |9350.0000 |2568777.6400 |295.8593|
 |70 |1 |3.80 |83.20 |7602 |89.59 |133.568 |7000 |524.073 |382.653 |799.463 |1202.179 |1576.809 |57.2885 |52.4077 |10761.0000 |3013600.9670 |315.2540|
-<a name="3.2"></a>
-
-**II. Test conclusions**
-
-With asynchronous mode and dynamic batching enabled, throughput improves substantially when testing data of different shapes concurrently.
-
-<div align=center>
-<img src='images/6-1_Cpp_Asynchronous_Framwork_CN_1.png' height = "600" align="middle"/>
-</div>
-
-Dynamic batching does increase response latency, but in most tested scenarios the throughput gain outweighs the latency growth; in the high-concurrency case (client=70), throughput rises 105% while latency grows only 33%.
-
-|Client |1 |5 |10 | 20 |30 |40 |50 |70 |
-|---|---|---|---|---|---|---|---|---|
-|QPS |-2.08% |-7.23% |-1.89% |20.55% |23.02% |23.34% |46.41% |105.27% |
-|Latency | 2.70% |7.09% |5.24% |13.34% |10.80% |43.60% |8.72% |33.89% |
-
-Asynchronous mode effectively improves throughput.
@@ -33,7 +33,7 @@ C++ Serving request and response data are in protobuf format; the key structures are
 ## Tensor
-Tensor can carry data of multiple types and is the basic unit of Request and Response. Tensor is defined as follows:
+[Tensor](https://github.com/PaddlePaddle/Serving/blob/develop/core/general-server/proto/general_model_service.proto#L22) can carry data of multiple types and is the basic unit of Request and Response. Tensor is defined as follows:
 ```protobuf
 message Tensor {
@@ -104,7 +104,7 @@ The key members of the Tensor structure are `elem_type`, `shape`, `lod`, and `name/alias_name`
 - name/alias_name: name and alias, matching the model configuration
 - elem_type: data type; currently FLOAT32, INT64, INT32, UINT8, INT8, and FLOAT16 are supported
 - shape: data dimensions
 - lod: variable-length structure. The LoD (Level-of-Detail) Tensor is an advanced Paddle feature that extends Tensor to support more flexible data input (e.g., lod = [0, 2, 5] packs two sequences of lengths 2 and 3 into a 5-row tensor). See [LOD](../LOD_CN.md)
 |elem_type|Type|
 |---------|----|
@@ -146,7 +146,7 @@ tensor->set_alias_name(alias_name);
 // copy data
 int total_number = float_data.size();
 tensor->mutable_float_data()->Resize(total_number, 0);
-memcpy(tensor->mutable_float_data()->mutable_data(), float_datadata(), total_number * sizeof(float));
+memcpy(tensor->mutable_float_data()->mutable_data(), float_data.data(), total_number * sizeof(float));
 ```
<a name="1.2"></a> <a name="1.2"></a>
...@@ -174,8 +174,7 @@ tensor->set_tensor_content(string_data); ...@@ -174,8 +174,7 @@ tensor->set_tensor_content(string_data);
## Request ## Request
Request 为客户端需要发送的请求数据,其以 Tensor 为基础数据单元,并包含了额外的请求信息。定义如下:
Request为客户端需要发送的请求数据,其以Tensor为基础数据单元,并包含了额外的请求信息。定义如下:
```protobuf ```protobuf
message Request { message Request {
...@@ -186,7 +185,7 @@ message Request { ...@@ -186,7 +185,7 @@ message Request {
}; };
``` ```
- fetch_vat_names: 需要获取的输出数据名称,在GeneralResponseOP会根据该列表进行过滤.请参考模型文件serving_client_conf.prototxt中的`fetch_var`字段下的`alias_name` - fetch_vat_names: 需要获取的输出数据名称,在 `GeneralResponseOP` 会根据该列表进行过滤.请参考模型文件 `serving_client_conf.prototxt` 中的 `fetch_var` 字段下的 `alias_name`
- profile_server: 调试参数,打开时会输出性能信息 - profile_server: 调试参数,打开时会输出性能信息
- log_id: 请求ID - log_id: 请求ID
@@ -211,12 +210,14 @@ Tensor *tensor = req.add_tensor();
 **II. Building a Json Request**
-When using RESTful requests, data can be sent in Json format, structured as follows:
+When using RESTful requests, data can be sent in Json format; an example follows:
 ```JSON
 {"tensor":[{"float_data":[0.0137,-0.1136,0.2553,-0.0692,0.0582,-0.0727,-0.1583,-0.0584,0.6283,0.4919,0.1856,0.0795,-0.0332],"elem_type":1,"name":"x","alias_name":"x","shape":[1,13]}],"fetch_var_names":["price"],"log_id":0}
 ```
+You can follow this example as-is: keep the overall structure and modify only the data types and the data.
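A minimal sketch of posting this Json payload from Python; the endpoint path is an assumption (brpc-style HTTP services expose /&lt;Service&gt;/&lt;Method&gt;, here GeneralModelService/inference), as are the host and port:

```python
# A hedged sketch; the endpoint path, host, and port are assumptions.
import requests

payload = {
    "tensor": [{
        "float_data": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
                       -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332],
        "elem_type": 1,  # FLOAT32
        "name": "x",
        "alias_name": "x",
        "shape": [1, 13],
    }],
    "fetch_var_names": ["price"],
    "log_id": 0,
}

resp = requests.post("http://127.0.0.1:9292/GeneralModelService/inference",
                     json=payload, timeout=10)
print(resp.json())  # on success, contains the "price" output tensor
```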
<a name="3"></a> <a name="3"></a>
## Response ## Response
...@@ -242,8 +243,8 @@ message ModelOutput { ...@@ -242,8 +243,8 @@ message ModelOutput {
Response 结构中核心成员: Response 结构中核心成员:
- profile_time:当设置 `request->set_profile_server(true)` 时,会返回性能信息 - profile_time:当设置 `request->set_profile_server(true)` 时,会返回性能信息
- err_no:错误码,详见 `core/predictor/common/constant.h` - err_no:错误码
- err_msg:错误信息,详见 `core/predictor/common/constant.h` - err_msg:错误信息
- engine_name:输出节点名称 - engine_name:输出节点名称
|err_no|err_msg| |err_no|err_msg|
......
@@ -38,7 +38,7 @@ Paddle Serving's ABTest capability is built on the PYTHON SDK and multiple servers
 **I. Installing Paddle Serving Wheels**
-Using the ABTest capability requires the PYTHON SDK, so the `paddle_serving_client` wheel must be installed. The installation method is as follows:
+Using the ABTest capability requires the PYTHON SDK, so the `paddle_serving_client` wheel must be installed. The [installation method](./2-1_Docker_Images_CN.md) is as follows:
 ```
 pip3 install paddle-serving-client==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
@@ -48,7 +48,7 @@ pip3 install paddle-serving-client==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/s
 **II. Downloading multiple models and saving model parameters**
-This example provides a one-click download script, `sh get_data.sh`, which downloads models trained in 3 different ways: `bow`, `cnn`, and `lstm`.
+This example provides a one-click download script, `sh get_data.sh`, which downloads self-trained models produced by 3 different training approaches: `bow`, `cnn`, and `lstm`.
 ```
 sh get_data.sh
......
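To round out the ABTest section above, a hedged client-side sketch using the PYTHON SDK's variant routing; the config path, endpoints, weights, and feed/fetch names are assumptions for illustration:

```python
# A hedged ABTest sketch; paths, endpoints, weights, and feed/fetch names
# are illustrative assumptions.
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("imdb_bow_client_conf/serving_client_conf.prototxt")
client.add_variant("bow", ["127.0.0.1:9297"], 50)    # ~50% of traffic
client.add_variant("lstm", ["127.0.0.1:9298"], 50)   # ~50% of traffic
client.connect()

words = np.array([[8, 233, 52, 601]])  # dummy word ids
fetch_map = client.predict(feed={"words": words}, fetch=["prediction"],
                           batch=True, need_variant_tag=True)
print(fetch_map)  # includes the tag of the variant that served the request
```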
 # Python Pipeline performance testing
 - [Test environment](#1)
-- [Performance metrics](#2)
+- [Performance metrics and conclusions](#2)
 <a name="1"></a>
@@ -18,7 +18,12 @@
 <a name="2"></a>
-## Performance metrics
+## Performance metrics and conclusions
+
+Testing shows that Python Pipeline mode, using multi-process concurrency, makes full use of the GPU and delivers good throughput.
+
+The test data are as follows:
 |model_name |thread_num |batch_size |CPU_util(%) |GPU_memory(mb) |GPU_util(%) |qps(samples/s) |total count |mean(ms) |median(ms) |80 percent(ms) |90 percent(ms) |99 percent(ms) |total cost(s) |each cost(s)|
 |:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--
......
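As a companion to the throughput claim above, a rough multi-threaded probe for a Pipeline HTTP service; the endpoint URL, payload keys, and counts are assumptions following the Pipeline web-service convention:

```python
# A hedged throughput probe; URL, payload, and counts are assumptions.
import json
import time
import threading
import requests

URL = "http://127.0.0.1:18082/uci/prediction"  # assumed Pipeline endpoint
PAYLOAD = {"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, "
                                   "-0.0727, -0.1583, -0.0584, 0.6283, 0.4919, "
                                   "0.1856, 0.0795, -0.0332"]}
N_THREADS, N_REQS = 8, 100  # concurrent clients, requests per client

def worker():
    for _ in range(N_REQS):
        requests.post(URL, data=json.dumps(PAYLOAD), timeout=10)

start = time.time()
threads = [threading.Thread(target=worker) for _ in range(N_THREADS)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print("qps(samples/s): %.2f" % (N_THREADS * N_REQS / (time.time() - start)))
```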
@@ -121,7 +121,7 @@ class Server(object):
         self.trt_dynamic_shape_info = []
         self.gpu_memory_mb = 50
         self.cpu_math_thread_num = 1
         self.trt_workspace_size = 33554432  # 1 << 25
         self.trt_use_static = False
     def get_fetch_list(self, infer_node_idx=-1):
@@ -358,10 +358,10 @@ class Server(object):
             engine.use_xpu = self.use_xpu
             engine.use_ascend_cl = self.use_ascend_cl
             engine.use_gpu = False
-            engine.gpu_memory_mb = self.gpu_memory_mb
-            engine.cpu_math_thread_num = self.cpu_math_thread_num
-            engine.trt_workspace_size = self.trt_workspace_size
-            engine.trt_use_static = self.trt_use_static
+            #engine.gpu_memory_mb = self.gpu_memory_mb
+            #engine.cpu_math_thread_num = self.cpu_math_thread_num
+            #engine.trt_workspace_size = self.trt_workspace_size
+            #engine.trt_use_static = self.trt_use_static
             # use distributed model.
             if self.dist_subgraph_index >= 0:
......
@@ -42,6 +42,13 @@ message Request {
 message Response {
   repeated ModelOutput outputs = 1;
   repeated int64 profile_time = 2;
+  bool profile_server = 3;
+  uint64 log_id = 4;
+
+  // Error code
+  int32 err_no = 5;
+
+  // Error messages
+  string err_msg = 6;
 };
 message ModelOutput {
......