From 9a2ffb15855a93acad0a886c879ac347087e27e4 Mon Sep 17 00:00:00 2001
From: TeslaZhao
Date: Tue, 17 May 2022 15:30:07 +0800
Subject: [PATCH] Update doc

---
 cmake/paddlepaddle.cmake                      | 20 +++----
 .../proto/general_model_service.proto         |  1 -
 core/configure/proto/server_configure.proto   | 17 +++---
 .../proto/general_model_service.proto         |  2 +-
 .../sdk-cpp/proto/general_model_service.proto |  2 +-
 .../proto/load_general_model_service.proto    |  1 +
 .../6-1_Cpp_Asynchronous_Framwork_CN.md       | 52 ++++++++++---------
 .../6-2_Cpp_Serving_Protocols_CN.md           | 19 +++----
 doc/Offical_Docs/6-5_Cpp_ABTest_CN.md         |  4 +-
 .../7-4_Python_Pipeline_Benchmark_CN.md       |  9 +++-
 python/paddle_serving_server/server.py        | 10 ++--
 .../proto/general_model_service.proto         |  7 +++
 12 files changed, 79 insertions(+), 65 deletions(-)

diff --git a/cmake/paddlepaddle.cmake b/cmake/paddlepaddle.cmake
index b3481462..2a855b88 100644
--- a/cmake/paddlepaddle.cmake
+++ b/cmake/paddlepaddle.cmake
@@ -30,7 +30,7 @@ message( "WITH_GPU = ${WITH_GPU}")
 # Paddle Version should be one of:
 # latest: latest develop build
 # version number like 1.5.2
-SET(PADDLE_VERSION "2.3.0")
+SET(PADDLE_VERSION "2.3.0-no-ort")
 if (WITH_GPU)
   message("CUDA: ${CUDA_VERSION}, CUDNN_MAJOR_VERSION: ${CUDNN_MAJOR_VERSION}")
   # cuda 11.0 is not supported, 11.2 would be added.
@@ -171,25 +171,25 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mklml/lib)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib")
 LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib)
 
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib")
-LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib)
+#SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib")
+#LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib)
 
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib")
-LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib)
+#SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib")
+#LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib)
 
 if (NOT WITH_MKLML)
   ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
   SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)
 endif()
 
-ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib/libpaddle2onnx.so)
+#ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
+#SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib/libpaddle2onnx.so)
 
-ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib/libonnxruntime.so.1.10.0)
+#ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
+#SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib/libonnxruntime.so.1.10.0)
 
 ADD_LIBRARY(paddle_inference STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.so)
+SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.a)
 if (WITH_ASCEND_CL)
   SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.so)
 endif()
diff --git a/core/configure/proto/general_model_service.proto b/core/configure/proto/general_model_service.proto
index f3da98a9..b4f1ce0c 100644
--- a/core/configure/proto/general_model_service.proto
+++ b/core/configure/proto/general_model_service.proto
@@ -92,7 +92,6 @@ message Response {
   repeated int64 profile_time = 2;
   bool profile_server = 3;
   uint64 log_id = 4;
-
   // Error code
   int32 err_no = 5;
   // Error messages
diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto
index f5edd23f..4f49aa3c 100644
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -51,17 +51,14 @@ message EngineDesc {
 
   /*
    * "gpu_memory_mb": allocate gpu memory by config.EnableUseGpu()
-   * "cpu_math_thread_num": set thread numbers of cpu math by
-   * config.SetCpuMathLibraryNumThreads()
-   * "trt_workspace_size": set TensorRT workspace size by
-   * config.EnableTensorRtEngine(), 1 << 25 default
-   * "trt_use_static": If true, save the optimization information of the TRT
-   * serialized to the disk, and load from the disk.
+   * "cpu_math_thread_num": set thread numbers of cpu math by config.SetCpuMathLibraryNumThreads()
+   * "trt_workspace_size": set TensorRT workspace size by config.EnableTensorRtEngine(), 1 << 25 default
+   * "trt_use_static": If true, save the optimization information of the TRT serialized to the disk, and load from the disk.
    */
-  optional int32 gpu_memory_mb = 22 [ default = 100 ];
-  optional int32 cpu_math_thread_num = 23 [ default = 1 ];
-  optional int32 trt_workspace_size = 24 [ default = 33554432 ];
-  optional bool trt_use_static = 25 [ default = false ];
+  optional int32 gpu_memory_mb = 22 [default = 100];
+  optional int32 cpu_math_thread_num = 23 [default = 1];
+  optional int32 trt_workspace_size = 24 [default = 33554432];
+  optional bool trt_use_static = 25 [default = false];
 
   /*
    * "runtime_thread_num": n == 0 means don`t use Asynchronous task scheduling
diff --git a/core/general-server/proto/general_model_service.proto b/core/general-server/proto/general_model_service.proto
index 904ffb97..a5adeeb9 100755
--- a/core/general-server/proto/general_model_service.proto
+++ b/core/general-server/proto/general_model_service.proto
@@ -94,9 +94,9 @@ message Response {
   repeated int64 profile_time = 2;
   bool profile_server = 3;
   uint64 log_id = 4;
+
   // Error code
   int32 err_no = 5;
-
   // Error messages
   string err_msg = 6;
 };
diff --git a/core/sdk-cpp/proto/general_model_service.proto b/core/sdk-cpp/proto/general_model_service.proto
index de951625..5c17f955 100755
--- a/core/sdk-cpp/proto/general_model_service.proto
+++ b/core/sdk-cpp/proto/general_model_service.proto
@@ -94,9 +94,9 @@ message Response {
   repeated int64 profile_time = 2;
   bool profile_server = 3;
   uint64 log_id = 4;
+
   // Error code
   int32 err_no = 5;
-
   // Error messages
   string err_msg = 6;
 };
diff --git a/core/sdk-cpp/proto/load_general_model_service.proto b/core/sdk-cpp/proto/load_general_model_service.proto
index c58f79ec..da731589 100644
--- a/core/sdk-cpp/proto/load_general_model_service.proto
+++ b/core/sdk-cpp/proto/load_general_model_service.proto
@@ -21,6 +21,7 @@ option cc_generic_services = true;
 message RequestAndResponse {
   required int32 a = 1;
   required float b = 2;
+  required uint64 log_id = 3 [ default = 0 ];
 };
 
 service LoadGeneralModelService {
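The `EngineDesc` options reformatted above (`gpu_memory_mb`, `cpu_math_thread_num`, `trt_workspace_size`, `trt_use_static`, fields 22-25) are mirrored by plain attributes on the Python `Server` class that this patch also edits. A minimal sketch of tuning them, assuming the attributes shown in the `python/paddle_serving_server/server.py` hunk further down are meant to be set directly:

```python
# Hedged sketch: ties the EngineDesc knobs above to the Server attributes that
# appear in the server.py hunk later in this patch. Values are illustrative.
from paddle_serving_server.server import Server

server = Server()
server.gpu_memory_mb = 100           # EngineDesc.gpu_memory_mb -> config.EnableUseGpu()
server.cpu_math_thread_num = 4       # -> config.SetCpuMathLibraryNumThreads()
server.trt_workspace_size = 1 << 25  # 33554432, TensorRT workspace size
server.trt_use_static = False        # persist/reload serialized TRT optimization info
```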
diff --git a/doc/Offical_Docs/6-1_Cpp_Asynchronous_Framwork_CN.md b/doc/Offical_Docs/6-1_Cpp_Asynchronous_Framwork_CN.md
index ed7daf54..37c2f2af 100644
--- a/doc/Offical_Docs/6-1_Cpp_Asynchronous_Framwork_CN.md
+++ b/doc/Offical_Docs/6-1_Cpp_Asynchronous_Framwork_CN.md
@@ -8,8 +8,8 @@
   - [开启同步模式](#2.1)
   - [开启异步模式](#2.2)
 - [性能测试](#3)
-  - [测试数据](#3.1)
-  - [测试结论](#3.2)
+  - [测试结果](#3.1)
+  - [测试数据](#3.2)
 
@@ -75,7 +75,7 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 16 --p
 
 **二.开启异步模式**
 
-启动命令使用 `--runtime_thread_num 4` 和 `--batch_infer_size 32` 开启异步模式,Serving 框架会启动8个异步线程,单次合并最大批量为32,自动开启动态 Padding。
+启动命令使用 `--runtime_thread_num 2` 和 `--batch_infer_size 32` 开启异步模式,Serving 框架会启动2个异步线程,单次合并最大批量为32,自动开启动态 Padding。
 ```
 python3 -m paddle_serving_server.serve --model uci_housing_model --thread 16 --port 9292 --runtime_thread_num 4 --batch_infer_size 32 --ir_optim --gpu_multi_stream --gpu_ids 0
 ```
@@ -84,11 +84,12 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 16 --p
 
 ## 性能测试
 
-GPU:Tesla P4 7611 MiB
-Cuda:cuda11.2-cudnn8-trt8
-Python:python3.7
-模型:ResNet_v2_50
-测试数据:构造全1输入,单client请求100次,shape 范围(1, 224 ± 50, 224 ± 50)
+
+- GPU:Tesla P4 7611 MiB
+- CUDA:cuda11.2-cudnn8-trt8
+- Python 版本:python3.7
+- 模型:ResNet_v2_50
+- 测试数据:构造全1输入,单client请求100次,shape 范围(1, 224 ± 50, 224 ± 50)
 
 同步模式启动命令:
 ```
@@ -102,7 +103,25 @@ python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --por
 
-**一.测试数据**
+**一.测试结果**
+
+使用异步模式,并开启动态批量后,并发测试不同 shape 数据时,吞吐性能大幅提升。
+
+
+
+
+**二.测试数据**
 
 1. 同步模式
@@ -147,20 +166,5 @@ python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --por
 |50 |1 |1.50 |50.60 |7578 |89.04 |121.545 |5000 |411.364 |331.118 |605.809 |874.543 |1285.650 |48.2343 |41.1369 |9350.0000 |2568777.6400 |295.8593|
 |70 |1 |3.80 |83.20 |7602 |89.59 |133.568 |7000 |524.073 |382.653 |799.463 |1202.179 |1576.809 |57.2885 |52.4077 |10761.0000 |3013600.9670 |315.2540|
 
-
-
-**二.测试结论**
-使用异步模式,并开启动态批量后,并发测试不同 shape 数据时,吞吐性能大幅提升。
-
-
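The benchmark procedure described in the doc above (all-ones inputs, 100 requests per client, H/W varying within 224 ± 50) can be driven by a small concurrent client. A rough sketch under stated assumptions — the client config path, endpoint port, feed/fetch names (`image`, `score`) and NCHW layout are placeholders, not taken from the commit:

```python
# Hedged benchmark-client sketch for the test described above. Config path,
# endpoint, feed/fetch names and tensor layout are assumptions.
import time
from threading import Thread

import numpy as np
from paddle_serving_client import Client

def run_one_client(endpoint="127.0.0.1:9292", num_requests=100):
    client = Client()
    client.load_client_config("resnet_v2_50_client/serving_client_conf.prototxt")
    client.connect([endpoint])
    latency_ms = []
    for _ in range(num_requests):
        h = np.random.randint(174, 275)                 # 224 - 50 .. 224 + 50
        w = np.random.randint(174, 275)
        img = np.ones((1, 3, h, w), dtype=np.float32)   # all-ones input, varying shape
        start = time.time()
        client.predict(feed={"image": img}, fetch=["score"], batch=True)
        latency_ms.append((time.time() - start) * 1000)
    print("avg latency: %.2f ms" % (sum(latency_ms) / len(latency_ms)))

if __name__ == "__main__":
    threads = [Thread(target=run_one_client) for _ in range(10)]   # 10 concurrent clients
    for t in threads:
        t.start()
    for t in threads:
        t.join()
```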
diff --git a/doc/Offical_Docs/6-2_Cpp_Serving_Protocols_CN.md b/doc/Offical_Docs/6-2_Cpp_Serving_Protocols_CN.md
--- a/doc/Offical_Docs/6-2_Cpp_Serving_Protocols_CN.md
+++ b/doc/Offical_Docs/6-2_Cpp_Serving_Protocols_CN.md
 set_alias_name(alias_name);
 // 拷贝数据
 int total_number = float_data.size();
 tensor->mutable_float_data()->Resize(total_number, 0);
-memcpy(tensor->mutable_float_data()->mutable_data(), float_datadata(), total_number * sizeof(float));
+memcpy(tensor->mutable_float_data()->mutable_data(), float_data.data(), total_number * sizeof(float));
 ```
@@ -174,8 +174,7 @@ tensor->set_tensor_content(string_data);
 
 ## Request
 
-
-Request为客户端需要发送的请求数据,其以Tensor为基础数据单元,并包含了额外的请求信息。定义如下:
+Request 为客户端需要发送的请求数据,其以 Tensor 为基础数据单元,并包含了额外的请求信息。定义如下:
 
 ```protobuf
 message Request {
@@ -186,7 +185,7 @@ message Request {
 };
 ```
 
-- fetch_vat_names: 需要获取的输出数据名称,在GeneralResponseOP会根据该列表进行过滤.请参考模型文件serving_client_conf.prototxt中的`fetch_var`字段下的`alias_name`。
+- fetch_vat_names: 需要获取的输出数据名称,在 `GeneralResponseOP` 会根据该列表进行过滤.请参考模型文件 `serving_client_conf.prototxt` 中的 `fetch_var` 字段下的 `alias_name`。
 - profile_server: 调试参数,打开时会输出性能信息
 - log_id: 请求ID
@@ -211,12 +210,14 @@ Tensor *tensor = req.add_tensor();
 
 **二.构建 Json Request**
 
-当使用 RESTful 请求时,可以使用 Json 格式数据,具体格式如下:
+当使用 RESTful 请求时,可以使用 Json 格式数据,示例如下:
 
 ```JSON
 {"tensor":[{"float_data":[0.0137,-0.1136,0.2553,-0.0692,0.0582,-0.0727,-0.1583,-0.0584,0.6283,0.4919,0.1856,0.0795,-0.0332],"elem_type":1,"name":"x","alias_name":"x","shape":[1,13]}],"fetch_var_names":["price"],"log_id":0}
 ```
 
+可参考示例,不用修改整体结构,仅需修改数据类型和数据。
+
 
 ## Response
@@ -242,8 +243,8 @@ message ModelOutput {
 
 Response 结构中核心成员:
 - profile_time:当设置 `request->set_profile_server(true)` 时,会返回性能信息
-- err_no:错误码,详见 `core/predictor/common/constant.h`
-- err_msg:错误信息,详见 `core/predictor/common/constant.h`
+- err_no:错误码
+- err_msg:错误信息
 - engine_name:输出节点名称
 
 |err_no|err_msg|
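To make the Json Request shown above concrete: the same payload can be posted to the RESTful endpoint and the `err_no` / `err_msg` fields of the Response checked. A minimal sketch — the service URL (host, port and path) is an assumption, while the payload is copied verbatim from the example:

```python
# Hedged sketch of a RESTful call carrying the Json Request documented above.
# Only the payload layout comes from the doc; the URL is an assumption.
import requests

payload = {
    "tensor": [{
        "float_data": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
                       -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332],
        "elem_type": 1,
        "name": "x",
        "alias_name": "x",
        "shape": [1, 13],
    }],
    "fetch_var_names": ["price"],
    "log_id": 0,
}

reply = requests.post("http://127.0.0.1:9292/GeneralModelService/inference",
                      json=payload, timeout=10).json()
if reply.get("err_no", 0) != 0:               # err_no / err_msg as documented above
    raise RuntimeError("inference failed: %s" % reply.get("err_msg", ""))
print(reply)
```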
diff --git a/doc/Offical_Docs/6-5_Cpp_ABTest_CN.md b/doc/Offical_Docs/6-5_Cpp_ABTest_CN.md
index e6ff4adb..ff5c7145 100644
--- a/doc/Offical_Docs/6-5_Cpp_ABTest_CN.md
+++ b/doc/Offical_Docs/6-5_Cpp_ABTest_CN.md
@@ -38,7 +38,7 @@ Paddle Serving 的 ABTest 功能是基于 PYTHON SDK 和 多个服务端构成
 
 **一.安装 Paddle Serving Wheels**
 
-使用 ABTest 功能的前提是使用 PYTHON SDK,因此需要安装 `paddle_serving_client` 的 wheel 包。安装方法如下:
+使用 ABTest 功能的前提是使用 PYTHON SDK,因此需要安装 `paddle_serving_client` 的 wheel 包。[安装方法](./2-1_Docker_Images_CN.md) 如下:
 
 ```
 pip3 install paddle-serving-client==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
@@ -48,7 +48,7 @@ pip3 install paddle-serving-client==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/s
 
 **二.下载多个模型并保存模型参数**
 
-本示例已提供了一键下载脚本 `sh get_data.sh`,下载 `bow`、`cnn`和`lstm` 3种不同方式训练的模型。
+本示例已提供了一键下载脚本 `sh get_data.sh`,下载自训练的模型 `bow`、`cnn`和`lstm` 3种不同方式训练的模型。
 
 ```
 sh get_data.sh
diff --git a/doc/Offical_Docs/7-4_Python_Pipeline_Benchmark_CN.md b/doc/Offical_Docs/7-4_Python_Pipeline_Benchmark_CN.md
index 2dce5d19..9d29abf8 100644
--- a/doc/Offical_Docs/7-4_Python_Pipeline_Benchmark_CN.md
+++ b/doc/Offical_Docs/7-4_Python_Pipeline_Benchmark_CN.md
@@ -1,7 +1,7 @@
 # Python Pipeline 性能测试
 
 - [测试环境](#1)
-- [性能指标](#2)
+- [性能指标与结论](#2)
 
@@ -18,7 +18,12 @@
 
-## 性能指标
+## 性能指标与结论
+
+通过测试,使用 Python Pipeline 模式通过多进程并发,充分利用 GPU 显卡,具有较好的吞吐性能。
+
+
+测试数据如下:
 
 |model_name |thread_num |batch_size |CPU_util(%) |GPU_memory(mb) |GPU_util(%) |qps(samples/s) |total count |mean(ms) |median(ms) |80 percent(ms) |90 percent(ms) |99 percent(ms) |total cost(s) |each cost(s)|
 |:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--
diff --git a/python/paddle_serving_server/server.py b/python/paddle_serving_server/server.py
index c0090b75..4c05b43f 100755
--- a/python/paddle_serving_server/server.py
+++ b/python/paddle_serving_server/server.py
@@ -121,7 +121,7 @@ class Server(object):
         self.trt_dynamic_shape_info = []
         self.gpu_memory_mb = 50
         self.cpu_math_thread_num = 1
-        self.trt_workspace_size = 33554432 # 1 << 25
+        self.trt_workspace_size = 33554432  # 1 << 25
         self.trt_use_static = False
 
     def get_fetch_list(self, infer_node_idx=-1):
@@ -358,10 +358,10 @@ class Server(object):
             engine.use_xpu = self.use_xpu
             engine.use_ascend_cl = self.use_ascend_cl
             engine.use_gpu = False
-            engine.gpu_memory_mb = self.gpu_memory_mb
-            engine.cpu_math_thread_num = self.cpu_math_thread_num
-            engine.trt_workspace_size = self.trt_workspace_size
-            engine.trt_use_static = self.trt_use_static
+            #engine.gpu_memory_mb = self.gpu_memory_mb
+            #engine.cpu_math_thread_num = self.cpu_math_thread_num
+            #engine.trt_workspace_size = self.trt_workspace_size
+            #engine.trt_use_static = self.trt_use_static
 
             # use distributed model.
             if self.dist_subgraph_index >= 0:
diff --git a/tools/cpp_examples/demo-serving/proto/general_model_service.proto b/tools/cpp_examples/demo-serving/proto/general_model_service.proto
index 8fedb60e..3a1cba2c 100755
--- a/tools/cpp_examples/demo-serving/proto/general_model_service.proto
+++ b/tools/cpp_examples/demo-serving/proto/general_model_service.proto
@@ -42,6 +42,13 @@ message Request {
 message Response {
   repeated ModelOutput outputs = 1;
   repeated int64 profile_time = 2;
+  bool profile_server = 3;
+  uint64 log_id = 4;
+
+  // Error code
+  int32 err_no = 5;
+  // Error messages
+  string err_msg = 6;
 };
 
 message ModelOutput {
-- 
GitLab