未验证 提交 8209fb4d 编写于 作者: ShiningZhang 提交者: GitHub

Merge branch 'develop' into dev-doc1

......@@ -43,7 +43,7 @@ The goal of Paddle Serving is to provide high-performance, flexible and easy-to-
<h2 align="center">Tutorial and Papers</h2>
- AIStudio tutorial(Chinese) : [Paddle Serving服务化部署框架](https://www.paddlepaddle.org.cn/tutorials/projectdetail/2538249)
- AIStudio tutorial(Chinese) : [Paddle Serving服务化部署框架](https://www.paddlepaddle.org.cn/tutorials/projectdetail/3946013)
- AIStudio OCR practice(Chinese) : [基于PaddleServing的OCR服务化部署实战](https://aistudio.baidu.com/aistudio/projectdetail/3630726)
- Video tutorial(Chinese) : [深度学习服务化部署-以互联网应用为例](https://aistudio.baidu.com/aistudio/course/introduce/19084)
- Edge AI solution(Chinese) : [基于Paddle Serving&百度智能边缘BIE的边缘AI解决方案](https://mp.weixin.qq.com/s/j0EVlQXaZ7qmoz9Fv96Yrw)
......
......@@ -41,7 +41,7 @@ Paddle Serving依托深度学习框架PaddlePaddle旨在帮助深度学习开发
<h2 align="center">教程与论文</h2>
- AIStudio 使用教程 : [Paddle Serving服务化部署框架](https://www.paddlepaddle.org.cn/tutorials/projectdetail/2538249)
- AIStudio 使用教程 : [Paddle Serving服务化部署框架](https://www.paddlepaddle.org.cn/tutorials/projectdetail/3946013)
- AIStudio OCR实战 : [基于PaddleServing的OCR服务化部署实战](https://aistudio.baidu.com/aistudio/projectdetail/3630726)
- 视频教程 : [深度学习服务化部署-以互联网应用为例](https://aistudio.baidu.com/aistudio/course/introduce/19084)
- 边缘AI 解决方案 : [基于Paddle Serving&百度智能边缘BIE的边缘AI解决方案](https://mp.weixin.qq.com/s/j0EVlQXaZ7qmoz9Fv96Yrw)
......
......@@ -30,7 +30,7 @@ message( "WITH_GPU = ${WITH_GPU}")
# Paddle Version should be one of:
# latest: latest develop build
# version number like 1.5.2
SET(PADDLE_VERSION "2.3.0-rc0")
SET(PADDLE_VERSION "2.3.0")
if (WITH_GPU)
message("CUDA: ${CUDA_VERSION}, CUDNN_MAJOR_VERSION: ${CUDNN_MAJOR_VERSION}")
# cuda 11.0 is not supported, 11.2 would be added.
......@@ -53,6 +53,7 @@ else()
set(WITH_TRT OFF)
endif()
if (WITH_GPU)
SET(PADDLE_VERSION "2.3.0-no-ort")
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/GPU/${CUDA_SUFFIX}")
elseif (WITH_LITE)
message("cpu arch: ${CMAKE_SYSTEM_PROCESSOR}")
......@@ -85,6 +86,7 @@ elseif (WITH_ASCEND_CL)
endif()
else()
if (WITH_AVX)
SET(PADDLE_VERSION "2.3.0-no-ort")
if (WITH_MKLML)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/CPU/gcc8.2_avx_mkl")
else()
......@@ -171,14 +173,27 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mklml/lib)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib")
LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib)
#SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib")
#LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib)
#SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib")
#LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib)
if (NOT WITH_MKLML)
ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)
endif()
#ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
#SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/paddle2onnx/lib/libpaddle2onnx.so)
#ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
#SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/onnxruntime/lib/libonnxruntime.so.1.10.0)
ADD_LIBRARY(paddle_inference STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.so)
if (WITH_ASCEND_CL)
SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.a)
if (WITH_ASCEND_CL OR WITH_XPU)
SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.so)
endif()
......
......@@ -90,11 +90,12 @@ message Request {
message Response {
repeated ModelOutput outputs = 1;
repeated int64 profile_time = 2;
bool profile_server = 3;
uint64 log_id = 4;
// Error code
int32 err_no = 3;
int32 err_no = 5;
// Error messages
string err_msg = 4;
string err_msg = 6;
};
message ModelOutput {
......
......@@ -49,6 +49,17 @@ message EngineDesc {
optional bool gpu_multi_stream = 20;
optional bool use_ascend_cl = 21;
/*
* "gpu_memory_mb": allocate gpu memory by config.EnableUseGpu()
* "cpu_math_thread_num": set thread numbers of cpu math by config.SetCpuMathLibraryNumThreads()
* "trt_workspace_size": set TensorRT workspace size by config.EnableTensorRtEngine(), 1 << 25 default
* "trt_use_static": If true, save the optimization information of the TRT serialized to the disk, and load from the disk.
*/
optional int32 gpu_memory_mb = 22 [default = 100];
optional int32 cpu_math_thread_num = 23 [default = 1];
optional int32 trt_workspace_size = 24 [default = 33554432];
optional bool trt_use_static = 25 [default = false];
/*
* "runtime_thread_num": n == 0 means don`t use Asynchronous task scheduling
* mode.
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/general_remote_op.h"
#include <iostream>
#include <sstream>
#include "core/util/include/timer.h"
// paddle inference 2.1 support: FLOAT32, INT64, INT32, UINT8, INT8
// will support: FLOAT16
#define BRPC_MAX_BODY_SIZE 2 * 1024 * 1024 * 1024
const std::string LODABALANCE = "";
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response;
brpc::Channel BRPCStub::brpc_channels[MAX_MP_NUM];
brpc::ChannelOptions BRPCStub::options;
std::atomic<int> BRPCStub::inited(0);
int GeneralRemoteOp::inference() {
LOG(INFO) << "Enter GeneralRemoteOp:inference()";
int expected = 0;
std::vector<std::string> op_address = address();
if (BRPCStub::inited.compare_exchange_strong(expected, 1)) {
BRPCStub::options.protocol = "baidu_std";
BRPCStub::options.connection_type = "short";
BRPCStub::options.timeout_ms = 80000 /*milliseconds*/;
BRPCStub::options.max_retry = 100;
brpc::fLU64::FLAGS_max_body_size = BRPC_MAX_BODY_SIZE;
LOG(ERROR) << "address size: " << op_address.size();
for (int i = 0; i < op_address.size(); ++i) {
LOG(INFO) << i + 1 << " address is " << op_address[i].c_str();
BRPCStub::brpc_channels[i].Init(
op_address[i].c_str(), LODABALANCE.c_str(), &BRPCStub::options);
}
BRPCStub::inited++;
}
while (BRPCStub::inited < 2) {
}
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
VLOG(2) << "Going to run Remote inference";
Request* req = (Request*)(get_request_message());
Response* res = mutable_data<Response>();
uint64_t log_id = req->log_id();
brpc::Controller brpc_controllers[MAX_MP_NUM];
brpc::CallId brpc_callids[MAX_MP_NUM];
Response brpc_response_tmp;
size_t i = 0;
// Init BRPC controllers, callids and stubs
for (i = 0; i < op_address.size(); ++i) {
brpc_controllers[i].set_log_id(log_id);
brpc_callids[i] = brpc_controllers[i].call_id();
}
for (i = 0; i < op_address.size(); ++i) {
baidu::paddle_serving::predictor::general_model::GeneralModelService_Stub
stub(&BRPCStub::brpc_channels[i]);
LOG(INFO) << "Sended 1 request to Slave Sever " << i;
if (0 == i) {
stub.inference(&brpc_controllers[i], req, res, brpc::DoNothing());
continue;
}
stub.inference(
&brpc_controllers[i], req, &brpc_response_tmp, brpc::DoNothing());
}
LOG(INFO) << "All request are sended, waiting for all responses.";
// Wait RPC done.
for (i = 0; i < op_address.size(); ++i) {
brpc::Join(brpc_callids[i]);
}
// Print RPC Results
for (i = 0; i < op_address.size(); ++i) {
LOG(INFO) << "brpc_controller_" << i
<< " status:" << brpc_controllers[i].Failed();
if (!brpc_controllers[i].Failed()) {
LOG(INFO) << "Received response from "
<< brpc_controllers[i].remote_side()
<< " Latency=" << brpc_controllers[i].latency_us() << "us";
} else {
LOG(ERROR) << brpc_controllers[i].ErrorText();
}
}
LOG(INFO) << "All brpc remote stubs joined done.";
res->set_log_id(log_id);
res->set_profile_server(req->profile_server());
int64_t end = timeline.TimeStampUS();
res->add_profile_time(start);
res->add_profile_time(end);
return 0;
}
DEFINE_OP(GeneralRemoteOp);
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <brpc/channel.h>
#include <butil/logging.h>
#include <butil/time.h>
#include <gflags/gflags.h>
#include <atomic>
#include <memory>
#include <string>
#include <vector>
#include "core/general-server/general_model_service.pb.h"
#include "core/sdk-cpp/builtin_format.pb.h"
#include "core/sdk-cpp/general_model_service.pb.h"
#include "core/sdk-cpp/include/common.h"
#include "core/sdk-cpp/include/predictor_sdk.h"
#define MAX_MP_NUM 16
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response;
class GeneralRemoteOp
: public baidu::paddle_serving::predictor::OpWithChannel<
baidu::paddle_serving::predictor::general_model::Response> {
public:
DECLARE_OP(GeneralRemoteOp);
int inference();
};
class BRPCStub {
public:
static brpc::Channel brpc_channels[MAX_MP_NUM];
static brpc::ChannelOptions options;
static std::atomic<int> inited;
};
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
......@@ -94,9 +94,9 @@ message Response {
repeated int64 profile_time = 2;
bool profile_server = 3;
uint64 log_id = 4;
// Error code
int32 err_no = 5;
// Error messages
string err_msg = 6;
};
......
......@@ -20,7 +20,7 @@ namespace predictor {
DEFINE_bool(use_parallel_infer_service, false, "");
DEFINE_int32(el_log_level, 16, "");
DEFINE_int32(idle_timeout_s, 16, "");
DEFINE_int32(idle_timeout_s, 80, "");
DEFINE_int32(port, 8010, "");
DEFINE_string(workflow_path, "./conf", "");
DEFINE_string(workflow_file, "workflow.prototxt", "");
......
......@@ -341,7 +341,7 @@ bool TaskExecutor<TaskT>::move_task_to_batch(
LOG(INFO) << "Hit auto padding, merge " << padding_task_count
<< " tasks into 1 batch.";
}
LOG(INFO) << "Number of tasks remaining in _task_queue is"
LOG(INFO) << "Number of tasks remaining in _task_queue is "
<< _task_queue.size();
return true;
}
......
......@@ -94,9 +94,9 @@ message Response {
repeated int64 profile_time = 2;
bool profile_server = 3;
uint64 log_id = 4;
// Error code
int32 err_no = 5;
// Error messages
string err_msg = 6;
};
......
......@@ -21,6 +21,7 @@ option cc_generic_services = true;
message RequestAndResponse {
required int32 a = 1;
required float b = 2;
required uint64 log_id = 3 [ default = 0 ];
};
service LoadGeneralModelService {
......
# 开发者贡献
- [贡献代码流程](#1)
- [创建个人仓库](#1.1)
- [本地克隆仓库和分支](#1.2)
- [提交代码](#1.3)
- [通过 CI 验证](#1.4)
- [Code Review](#1.5)
- [代码合入](#1.6)
- [致谢开发者](#2)
<a name="1"></a>
## 贡献代码流程
Paddle Serving 使用 Git 分支模式。通常,按以下步骤贡献代码:
<a name="1.1"></a>
**一.创建个人仓库**
Paddle Serving 社区一直在快速发展,所有人都直接提交到官方仓库是不现实的。所以,请先 `fork` 出个人仓库,再提交 `Pull Requests`。`fork` 个人仓库,只需前往 [Serving](https://github.com/PaddlePaddle/Serving) 页面并单击右上角 ["Fork"](https://github.com/PaddlePaddle/Serving/fork) 即可。
<a name="1.2"></a>
**二.本地克隆仓库和分支**
创建个人仓库后,`clone` 个人仓库到本地计算机,默认创建本地 `develop` 分支。
```bash
git clone https://github.com/your-github-account/Serving
```
<a name="1.3"></a>
**三.提交代码**
本地修改代码并验证后,准备提交代码。在提交代码前请安装 [`pre-commit`](http://pre-commit.com/)、cpplint 和 pylint。
```bash
pip3 install pre-commit
pre-commit install
pip3 install cpplint pylint
```
在提交代码时,会进行代码格式检查和修正,待所有检查都通过后,方可提交。
```shell
$ git commit
CRLF end-lines remover...............................(no files to check)Skipped
yapf.....................................................................Passed
Check for added large files..............................................Passed
Check for merge conflicts................................................Passed
Check for broken symlinks................................................Passed
Detect Private Key...................................(no files to check)Skipped
Fix End of Files.........................................................Passed
clang-format.............................................................Passed
cpplint..................................................................Passed
pylint...................................................................Passed
copyright_checker........................................................Passed
[my-cool-stuff c703c041] add test file
1 file changed, 0 insertions(+), 0 deletions(-)
create mode 100644 233
```
运行代码提交命令,提交到个人仓库,再通过 GitHub 页面创建一个 `pull request` 提交到 Paddle Serving 主仓库。
```bash
git push origin develop
```
<a name="1.4"></a>
**四.通过 CI 验证**
所有提交到 Paddle Serving 主仓库的 `pull request` 都会运行 `py36`、`py38`、`py39` 的所有 CI 测试用例,全部通过后才能合入。
<a name="1.5"></a>
**五.Code Review**
所有提交的代码要经过管理员的评审,至少通过2人评审后方可合入。
<a name="1.6"></a>
**六.代码合入**
待通过全部 CI 验证,并且完成 Code Review 和修改后,由仓库管理员合入代码。
<a name="2"></a>
## 致谢开发者
- 感谢 [@loveululu](https://github.com/loveululu) 提供 Cube python API
- 感谢 [@EtachGu](https://github.com/EtachGu) 更新 docker 使用命令
- 感谢 [@BeyondYourself](https://github.com/BeyondYourself) 提供grpc教程,更新FAQ教程,整理文件目录。
- 感谢 [@mcl-stone](https://github.com/mcl-stone) 提供faster rcnn benchmark脚本
- 感谢 [@cg82616424](https://github.com/cg82616424) 提供unet benchmark脚本和修改部分注释错误
- 感谢 [@cuicheng01](https://github.com/cuicheng01) 提供PaddleClas的11个模型
- 感谢 [@Jiaqi Liu](https://github.com/LiuChiachi) 新增list[str]类型输入的预测支持
- 感谢 [@Bin Lu](https://github.com/Intsigstephon) 提供PP-Shitu C++模型示例
# 常见问题与解答
常见问题解答分为8大类问题:
- [版本升级问题](#1)
- [基础知识](#2)
- [安装问题](#3)
- [编译问题](#4)
- [环境问题](#5)
- [部署问题](#6)
- [预测问题](#7)
- [日志排查](#8)
<a name="1"></a>
## 版本升级问题
#### Q: 从 `v0.6.x` 升级到 `v0.7.0` 版本时,运行 Python Pipeline 程序时报错信息如下:
```
Failed to predict: (data_id=1 log_id=0) [det|0] Failed to postprocess: postprocess() takes 4 positional arguments but 5 were given
```
**A:** 在服务端程序(例如 web_service.py)的 postprocess 函数定义中增加参数 data_id,改为 def postprocess(self, input_dicts, fetch_dict, **data_id**, log_id) 即可,示意如下。
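以下是一个最小示意(假设是 Python Pipeline 自定义 Op 的服务端程序,函数体与返回值仅作说明,具体以所用版本的官方示例为准):
```python
# 仅为示意:v0.7.0 起 postprocess 需要接收 data_id 参数(第4个位置参数)
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
    # input_dicts 为上游 Op 的输出,fetch_dict 为本 Op 的推理结果
    out_dict = {"res": str(fetch_dict)}
    # 0.8.x 官方示例中返回三元组:(输出字典, None, 错误信息字符串)
    return out_dict, None, ""
```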
<a name="2"></a>
## 基础知识
#### Q: Paddle Serving 、Paddle Inference、PaddleHub Serving 三者的区别及联系?
**A:** Paddle Serving 是远程服务,即发起预测的设备(手机、浏览器、客户端等)与实际预测的硬件不在一起。Paddle Inference 是一个 library,适合嵌入到一个大系统中保证预测效率,Paddle Serving 调用 Paddle Inference 做远程服务。PaddleHub Serving 可以认为是一个示例,它也使用 Paddle Serving 作为统一的预测服务入口。如果在 web 端交互,一般是调用远程服务的形式,可以使用 Paddle Serving 的 web service 搭建。
#### Q: Paddle Serving 支持哪些数据类型?
**A:** 在 protobuf 定义中 `feed_type` 与 `fetch_type` 的编号与数据类型对应如下,完整信息可参考[保存用于 Serving 部署的模型参数](./5-1_Save_Model_Params_CN.md)
| 类型 | 类型值 |
|------|------|
| int64 | 0 |
| float32 |1 |
| int32 | 2 |
| float64 | 3 |
| int16 | 4 |
| float16 | 5 |
| bfloat16 | 6 |
| uint8 | 7 |
| int8 | 8 |
| bool | 9 |
| complex64 | 10 |
| complex128 | 11 |
#### Q: Paddle Serving 是否支持 Windows 和 Linux 原生环境部署?
**A:** 安装 `Linux Docker`,在 Docker 中部署 Paddle Serving,参考[安装指南](./2-0_Index_CN.md)
#### Q: Paddle Serving 如何修改消息大小限制
**A:** Server 和 Client 通过修改 `FLAGS_max_body_size` 参数来扩大数据量限制,单位为字节,默认为 64MB。
#### Q: Paddle Serving 客户端目前支持哪些开发语言?
**A:** 提供 Python、C++ 和 Java SDK
#### Q: Paddle Serving 支持哪些网络协议?
**A:** C++ Serving 同时支持 HTTP、gRPC 和 bRPC 协议。其中 HTTP 协议既支持 HTTP + Json 格式,同时支持 HTTP + proto 格式。完整信息请阅读[C++ Serving 通讯协议](./6-2_Cpp_Serving_Protocols_CN.md);Python Pipeline 支持 HTTP 和 gRPC 协议,更多信息请阅读[Python Pipeline 框架设计](./7-1_Python_Pipeline_Design_CN.md)
<a name="3"></a>
## 安装问题
#### Q: `pip install` 安装 `python wheel` 过程中,报错信息如何修复?
```
Collecting opencv-python
Getting requirements to build wheel ... error
ERROR: Command errored out with exit status 1:
command: /home/work/Python-2.7.17/build/bin/python /home/work/Python-2.7.17/build/lib/python2.7/site-packages/pip/_vendor/pep517/_in_process.py get_requires_for_build_wheel /tmp/tmpLiweA9
cwd: /tmp/pip-install-_w6AUI/opencv-python
Complete output (22 lines):
Traceback (most recent call last):
File "setup.py", line 99, in main
% {"ext": re.escape(sysconfig.get_config_var("EXT_SUFFIX"))}
File "/home/work/Python-2.7.17/build/lib/python2.7/re.py", line 210, in escape
s = list(pattern)
TypeError: 'NoneType' object is not iterable
```
**A:** 指定 `opencv-python` 安装版本4.2.0.32,运行 `pip3 install opencv-python==4.2.0.32`
#### Q: pip3 安装 wheel 包过程报错,详细信息如下:
```
Complete output from command python setup.py egg_info:
Found cython-generated files...
error in grpcio setup command: 'install_requires' must be a string or list of strings containing valid project/version requirement specifiers; Expected ',' or end-of-list in futures>=2.2.0; python_version<'3.2' at ; python_version<'3.2'
----------------------------------------
Command "python setup.py egg_info" failed with error code 1 in /tmp/pip-install-taoxz02y/grpcio/
```
**A:** 需要升级 pip3 版本,再重新执行安装命令。
```
pip3 install --upgrade pip
pip3 install --upgrade setuptools
```
#### Q: 运行过程中出现 `No module named xxx` 错误,信息如下:
```
Traceback (most recent call last):
File "../../deploy/serving/test_client.py", line 18, in <module>
from paddle_serving_app.reader import *
File "/usr/local/python2.7.15/lib/python2.7/site-packages/paddle_serving_app/reader/__init__.py", line 15, in <module>
from .image_reader import ImageReader, File2Image, URL2Image, Sequential, Normalize, Base64ToImage
File "/usr/local/python2.7.15/lib/python2.7/site-packages/paddle_serving_app/reader/image_reader.py", line 24, in <module>
from shapely.geometry import Polygon
ImportError: No module named shapely.geometry
```
**A:** 有2种方法,第一种通过 pip3 安装shapely,第二种通过 pip3 安装所有依赖组件[requirements.txt](https://github.com/PaddlePaddle/Serving/blob/develop/python/requirements.txt)
```
方法1:
pip3 install shapely==1.7.0
方法2:
pip3 install -r python/requirements.txt
```
<a name="4"></a>
## 编译问题
#### Q: 如何使用自己编译的 Paddle Serving 进行预测?
**A:** 编译 Paddle Serving 请阅读[编译 Serving](https://github.com/PaddlePaddle/Serving/blob/v0.8.3/doc/Compile_CN.md)
#### Q: 使用 Java 客户端,mvn compile 过程出现 "No compiler is provided in this environment. Perhaps you are running on a JRE rather than a JDK?" 错误
**A:** 没有安装 JDK,或者 `JAVA_HOME` 路径配置错误(正确配置是 JDK 路径,常见错误配置成 JRE 路径,例如正确路径参考 `JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.262.b10-0.el7_8.x86_64/"`)。Java JDK 安装参考 https://segmentfault.com/a/1190000015389941。
#### Q: 编译过程报错 /usr/local/bin/ld: cannot find -lbz2
```
/usr/local/bin/ld: cannot find -lbz2
collect2: error: ld returned 1 exit status
core/general-server/CMakeFiles/serving.dir/build.make:276: recipe for target 'core/general-server/serving' failed
make[2]: *** [core/general-server/serving] Error 1
CMakeFiles/Makefile2:1181: recipe for target 'core/general-server/CMakeFiles/serving.dir/all' failed
make[1]: *** [core/general-server/CMakeFiles/serving.dir/all] Error 2
Makefile:129: recipe for target 'all' failed
make: *** [all] Error 2
```
**A:** Ubuntu 系统运行命令安装 libbz2: `apt install libbz2-dev`
<a name="5"></a>
## 环境问题
#### Q:程序运行出现 `CXXABI` 相关错误。
**A:** 错误原因是编译 Python 使用的 GCC 版本和编译 Serving 的 GCC 版本不一致。对于 Docker 用户,推荐使用[Docker容器](https://github.com/PaddlePaddle/Serving/blob/develop/doc/Docker_Images_CN.md),由于 Docker 容器内的 Python 版本与 Serving 在发布前都做过适配,这样就不会出现类似的错误。
推荐使用 GCC 8.2 预编译包 [Python3.6](https://paddle-serving.bj.bcebos.com/others/Python3.6.10-gcc82.tar) 。下载解压后,需要将对应的目录设置为 `PYTHONROOT`,并设置 `PATH``LD_LIBRARY_PATH`
```bash
export PYTHONROOT=/path/of/python # 对应解压后的Python目录
export PATH=$PYTHONROOT/bin:$PATH
export LD_LIBRARY_PATH=$PYTHONROOT/lib:$LD_LIBRARY_PATH
```
#### Q:遇到 `libstdc++.so.6` 的版本不够的问题
**A:** 触发该问题的原因在于,编译 Paddle Serving 相关可执行程序和动态库所采用的是 GCC 8.2(CUDA 9.0 和 10.0 的 Server 可执行程序受限于 CUDA 兼容性,采用 GCC 4.8 编译)。Python 在调用的过程中,有可能链接到了其他 GCC 版本的 `libstdc++.so`。需要做的是首先确保所在环境具备 GCC 8.2,其次将 GCC 8.2 的 `libstdc++.so.*` 拷贝到某个目录例如 `/home/libstdcpp` 下,最后 `export LD_LIBRARY_PATH=/home/libstdcpp:$LD_LIBRARY_PATH` 即可。
#### Q: 遇到 `OPENSSL_1.0.1EC` 符号找不到的问题。
**A:** 目前 Serving 的可执行程序和客户端动态库需要链接 `1.0.2k` 版本的 `openssl` 动态库。如果环境当中没有,可以执行:
```bash
wget https://paddle-serving.bj.bcebos.com/others/centos_ssl.tar && \
tar xf centos_ssl.tar && rm -rf centos_ssl.tar && \
mv libcrypto.so.1.0.2k /usr/lib/libcrypto.so.1.0.2k && mv libssl.so.1.0.2k /usr/lib/libssl.so.1.0.2k && \
ln -sf /usr/lib/libcrypto.so.1.0.2k /usr/lib/libcrypto.so.10 && \
ln -sf /usr/lib/libssl.so.1.0.2k /usr/lib/libssl.so.10 && \
ln -sf /usr/lib/libcrypto.so.10 /usr/lib/libcrypto.so && \
ln -sf /usr/lib/libssl.so.10 /usr/lib/libssl.so
```
其中 `/usr/lib` 可以换成其他目录,并确保该目录在 `LD_LIBRARY_PATH` 下。
### GPU相关环境问题
#### Q:需要做哪些检查确保 Serving 可以运行在 GPU 环境
**注:如果是使用 Serving 提供的镜像不需要做下列检查,如果是其他开发环境可以参考以下指导。**
首先需要确保 `nvidia-smi` 可用,其次需要确保所需的动态库 so 文件位于 `LD_LIBRARY_PATH` 包含的目录中(包括系统 lib 库)。
(1)CUDA 显卡驱动:文件名通常为 `libcuda.so.$DRIVER_VERSION` 例如驱动版本为440.10.15,文件名就是 `libcuda.so.440.10.15`
(2)CUDA 和 cuDNN 动态库:文件名通常为 `libcudart.so.$CUDA_VERSION`,和 `libcudnn.so.$CUDNN_VERSION`。例如 CUDA9 就是 `libcudart.so.9.0`,Cudnn7就是 `libcudnn.so.7`。CUDA 和 cuDNN 与 Serving 的版本匹配参见[Serving所有镜像列表](Docker_Images_CN.md#%E9%99%84%E5%BD%95%E6%89%80%E6%9C%89%E9%95%9C%E5%83%8F%E5%88%97%E8%A1%A8).
(3) CUDA 10.1及更高版本需要 TensorRT。安装 TensorRT 相关文件的脚本参考 [install_trt.sh](../tools/dockerfiles/build_scripts/install_trt.sh).
<a name="6"></a>
## 部署问题
#### Q: GPU 环境运行 Serving 报错,GPU count is: 0。
```
terminate called after throwing an instance of 'paddle::platform::EnforceNotMet'
what():
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::string paddle::platform::GetTraceBackString<std::string const&>(std::string const&, char const*, int)
1 paddle::platform::SetDeviceId(int)
2 paddle::AnalysisConfig::fraction_of_gpu_memory_for_pool() const
3 std::unique_ptr<paddle::PaddlePredictor, std::default_delete<paddle::PaddlePredictor> > paddle::CreatePaddlePredictor<paddle::AnalysisConfig, (paddle::PaddleEngineKind)2>(paddle::AnalysisConfig const&)
4 std::unique_ptr<paddle::PaddlePredictor, std::default_delete<paddle::PaddlePredictor> > paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(paddle::AnalysisConfig const&)
----------------------
Error Message Summary:
----------------------
InvalidArgumentError: Device id must be less than GPU count, but received id is: 0. GPU count is: 0.
[Hint: Expected id < GetCUDADeviceCount(), but received id:0 >= GetCUDADeviceCount():0.] at (/home/scmbuild/workspaces_cluster.dev/baidu.lib.paddlepaddle/baidu/lib/paddlepaddle/Paddle/paddle/fluid/platform/gpu_info.cc:211)
```
**A:** 原因是 `libcuda.so` 没有链接成功。首先在机器上找到 `libcuda.so`,使用 `ldd` 命令检查 libnvidia 版本与 nvidia-smi 中版本是否一致(libnvidia-fatbinaryloader.so.418.39,与NVIDIA-SMI 418.39 Driver Version: 418.39),然后用 export 导出 `libcuda.so` 的路径即可(例如 libcuda.so 在 /usr/lib64/,export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib64/)
#### Q: 遇到 GPU not found, please check your environment or use cpu version by "pip install paddle_serving_server"
**A:** 检查环境中是否有N卡:`ls /dev/ | grep nvidia`
#### Q: Paddle Serving 支持哪些镜像环境?
**A:** 支持 CentOS 和 Ubuntu 环境镜像 ,完整列表查阅[这里](https://github.com/PaddlePaddle/Serving/blob/develop/doc/Docker_Images_CN.md)
#### Q: Paddle Serving 是否支持本地离线安装
**A:** 支持离线部署,需要把一些相关的[依赖包](https://github.com/PaddlePaddle/Serving/blob/develop/doc/Compile_CN.md) 提前准备安装好
#### Q: Docker 中启动 Server IP地址 127.0.0.1 与 0.0.0.0 差异
**A:** 必须将容器的主进程设置为绑定到特殊的 `0.0.0.0` 表示“所有接口”地址,否则它将无法从容器外部访问。在 Docker 中 `127.0.0.1` 仅代表“这个容器”,而不是“这台机器”。如果您从容器建立到 `127.0.0.1` 的出站连接,它将返回到同一个容器;如果您将服务器绑定到 `127.0.0.1`,接收不到来自外部的连接。
<a name="7"></a>
## 预测问题
#### Q: 使用 GPU 第一次预测时特别慢,如何调整 RPC 服务的等待时间避免超时?
**A:** GPU 第一次预测需要初始化。使用 `set_rpc_timeout_ms` 设置更长的等待时间,单位为毫秒,默认时间为20秒。
示例:
```
from paddle_serving_client import Client
client = Client()
client.load_client_config(sys.argv[1])
client.set_rpc_timeout_ms(100000)
client.connect(["127.0.0.1:9393"])
```
#### Q: 执行 GPU 预测时遇到 `ExternalError: Cudnn error, CUDNN_STATUS_BAD_PARAM at (../batch_norm_op.cu:198)`错误
**A:** 将 cuDNN 的 lib64 路径添加到 `LD_LIBRARY_PATH`。安装自 `pypi` 的 Paddle Serving 中,`post9` 版本使用的是 cuDNN 7.3,`post10` 版本使用的是 cuDNN 7.5。如果是使用自己编译的 Paddle Serving,可以在 `log/serving.INFO` 日志文件中查看对应的 cuDNN 版本。
#### Q: 执行 GPU 预测时遇到 `Error: Failed to find dynamic library: libcublas.so`
**A:** 将 CUDA 的 lib64 路径添加到 `LD_LIBRARY_PATH`。`post9` 版本的 Paddle Serving 使用的是 CUDA 9.0,`post10` 版本使用的是 CUDA 10.0。
#### Q: Client 的 `fetch var`变量名如何设置
**A:** 通过[保存用于 Serving 部署的模型参数](https://github.com/PaddlePaddle/Serving/blob/v0.8.3/doc/Save_EN.md) 生成配置文件 `serving_server_conf.prototxt`,获取需要的变量名。
#### Q: 如何使用多语言客户端
**A:** 多语言客户端要与多语言服务端配套使用。当前版本下(0.8.3)
#### Q: 如何在 Windows 下使用 Paddle Serving
**A:** 在 Windows 上可以运行多语言 RPC 客户端,或使用 HTTP 方式访问。
#### Q: 报错信息 `libnvinfer.so: cannot open shared object file: No such file or directory)`
**A:** 没有安装 TensorRT,安装 TensorRT 请参考链接: https://blog.csdn.net/hesongzefairy/article/details/105343525
<a name="8"></a>
## 日志排查
#### Q: 部署和预测中的日志信息在哪里查看?
**A:** Server 的日志分为两部分,一部分打印到标准输出,一部分打印到启动服务时的目录下的 `log/serving.INFO` 文件中。
Client 的日志直接打印到标准输出。
在部署服务之前执行 `export GLOG_v=3`,可以输出更为详细的日志信息。
#### Q: C++ Serving 启动成功后,日志文件在哪里,在哪里设置日志级别?
**A:** C++ Serving 服务的所有日志在程序运行的当前目录的`log/`目录下,分为 serving.INFO、serving.WARNING 和 serving.ERROR 文件。
1)警告是 `glog` 组件打印的,告知 `glog` 初始化之前日志打印在 STDERR;
2)一般采用 `GLOG_v` 方式启动服务同时设置日志级别。
例如:
```
GLOG_v=2 python -m paddle_serving_server.serve --model xxx_conf/ --port 9999
```
#### Q: Python Pipeline 启动成功后,日志文件在哪里,在哪里设置日志级别?
**A:** Python Pipeline 服务的日志信息请阅读[Python Pipeline 设计](./7-1_Python_Pipeline_Design_CN.md) 第三节服务日志。
#### Q: (GLOG_v=2下)Server 日志一切正常,但 Client 始终得不到正确的预测结果
**A:** 可能是配置文件有问题,检查下配置文件(is_load_tensor,fetch_type等有没有问题)
#### Q: 如何给 Server 传递 Logid
**A:** Logid 默认为 0,Client 通过在 predict 函数中指定 log_id 参数即可将其传递给 Server,示意如下。
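一个最小示意如下(假设使用 bRPC Client,其中的配置路径、feed/fetch 变量名与输入数据均为假设值):
```python
# 仅为示意:通过 predict 的 log_id 参数把 Logid 传给 Server(默认为 0)
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("serving_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9393"])

x = np.random.rand(1, 13).astype("float32")  # 假设的输入数据
fetch_map = client.predict(feed={"x": x}, fetch=["concat_1.tmp_0"], log_id=10000)
print(fetch_map)
```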
#### Q: C++ Serving 出现问题如何调试和定位
**A:** 推荐您使用 GDB 进行定位和调试。如果您使用 Serving 的 Docker,在启动容器的时候,需要加上 `docker run --privileged` 参数开启特权模式,这样才能在 Docker 容器中使用 GDB 定位和调试。
如果 C++ Serving 出现 `core dump`,一般会生成 core 文件;若没有,运行 `ulimit -c unlimited` 命令开启 core dump。
使用 GDB 调试 core 文件的方法为:`gdb <可执行文件> <core文件>`,进入后输入 `bt` 指令显示栈信息。
注意:可执行文件路径是 C++ bin 文件的路径,而不是 python 命令,一般为类似下面的这种 `/usr/local/lib/python3.6/site-packages/paddle_serving_server/serving-gpu-102-0.7.0/serving`
# 快速开始案例部署
您可以通过以下 Paddle Serving 快速开始案例,分别了解到 C++ Serving 与 Python Pipeline 2种框架的部署方法。
- [使用 C++ Serving 部署 Resnet50 模型案例]()
- [使用 Python Pipeline 部署 OCR 模型案例]()
- [使用 C++ Serving 部署 Resnet50 模型案例](./3-1_QuickStart_Cpp_Resnet_CN.md)
- [使用 Python Pipeline 部署 OCR 模型案例](./3-2_QuickStart_Pipeline_OCR_CN.md)
通过阅读以下内容掌握 Paddle Serving 基础功能以及2种框架特性和使用指南:
- [基础功能]()
- [进阶 C++ Serving 介绍]()
- [进阶 Python Pipeline 介绍]()
- [进阶 C++ Serving 介绍](./doc/Offical_Docs/6-0_C++_Serving_Advanced_Introduction_CN.md)
- [进阶 Python Pipeline 介绍](./7-0_Python_Pipeline_Int_CN.md)
# C++ Serving 快速部署案例
- [模型介绍](#1)
- [部署步骤](#2)
- [2.1 保存模型](#2.1)
- [2.2 保存 Serving 部署的模型参数](#2.2)
- [2.3 启动服务](#2.3)
- [2.4 启动客户端](#2.4)
<a name="1"></a>
## 模型介绍
残差网络(ResNet)于2015年被提出,摘得 ImageNet 榜单5项第一,成绩大幅领先第二名,是 CNN 图像史上的一个里程碑。
从经验上看,网络结构层数越多,有利于复杂特征的提取,从理论上讲会取得更好的结果。但是,随着网络层数的增加,准确率会趋于饱和甚至会下降,称为退化问题(Degradation problem)。其根本原因是深层网络出现梯度消失或者梯度爆炸的问题。残差网络利用短路机制加入了残差单元,解决了退化问题。
ResNet 网络参考了 VGG19 网络并加入了残差单元,ResNet50 共有 50 层网络。
<a name="2"></a>
## 部署步骤
前提条件是你已完成[环境安装](./2-0_Index_CN.md)步骤,并已验证环境安装成功,此处不再赘述。
克隆 Serving 仓库后,进入 `examples/C++/PaddleClas/resnet_50_vd` 目录下,已提供程序、配置和性能测试脚本。
```
git clone https://github.com/PaddlePaddle/Serving
```
按以下4个步骤操作即可实现模型部署。
- 一.获取模型
- 二.保存 Serving 部署的模型参数
- 三.启动服务
- 四.启动客户端
<a name="2.1"></a>
**一.获取模型**
下载 `ResNet50_vd` 的推理模型,更多模型信息请阅读[ImageNet 预训练模型库](https://github.com/PaddlePaddle/PaddleClas/blob/8fa820f5c81edb1e7a2b222306a307bc27bff90f/docs/zh_CN/algorithm_introduction/ImageNet_models.md)
```
wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/ResNet50_vd_infer.tar && tar xf ResNet50_vd_infer.tar
```
<a name="2.2"></a>
**二.保存 Serving 部署的模型参数**
`paddle_serving_client` 把下载的推理模型转换成易于 Serving 部署的模型格式,完整信息请参考 [保存用于 Serving 部署的模型参数](./5-1_Save_Model_Params_CN.md)
```
python3 -m paddle_serving_client.convert --dirname ./ResNet50_vd_infer/ \
--model_filename inference.pdmodel \
--params_filename inference.pdiparams \
--serving_server ./ResNet50_vd_serving/ \
--serving_client ./ResNet50_vd_client/
```
保存参数后,会在当前文件夹多出 `ResNet50_vd_serving` 和 `ResNet50_vd_client` 两个文件夹,分别用于服务端和客户端。
```
├── daisy.jpg
├── http_client.py
├── imagenet.label
├── ResNet50_vd_client
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
├── ResNet50_vd_infer
│   ├── inference.pdiparams
│   ├── inference.pdiparams.info
│   └── inference.pdmodel
├── ResNet50_vd_serving
│   ├── fluid_time_file
│   ├── inference.pdiparams
│   ├── inference.pdmodel
│   ├── serving_server_conf.prototxt
│   └── serving_server_conf.stream.prototxt
├── rpc_client.py
```
<a name="2.3"></a>
**三.启动服务**
C++ Serving 服务可以指定一个网络端口同时接收 HTTP、gRPC 和 bRPC 请求。命令参数 `--model` 指定模型路径,`--gpu_ids` 指定 GPU 卡,`--port` 指定端口。
```
python3 -m paddle_serving_server.serve --model ResNet50_vd_serving --gpu_ids 0 --port 9394
```
<a name="2.4"></a>
**四.启动客户端**
HTTP 客户端程序 `http_client.py` 创建请求参数,向服务端发起 HTTP 请求。
```
python3 http_client.py
```
RPC 客户端程序 `rpc_client.py` 创建请求参数,向服务端发起 gRPC 请求。
```
python3 rpc_client.py
```
成功运行后,模型预测的结果会打印如下:
```
prediction: daisy, probability: 0.9341399073600769
```
# Python Pipeline 快速部署案例
- [模型介绍](#1)
- [部署步骤](#2)
- [获取模型与保存模型参数](#2.1)
- [保存 Serving 部署的模型参数](#2.2)
- [下载测试数据集(可选)](#2.3)
- [修改配置文件(可选)](#2.4)
- [代码与配置信息绑定](#2.5)
- [启动服务与验证](#2.6)
Python Pipeline 框架使用 Python 语言开发,是一套端到端多模型组合服务编程框架,旨在降低编程门槛,提高资源使用率(尤其是GPU设备),提升整体服务的预估效率。详细设计参考[ Python Pipeline 设计与使用]()
<a name="1"></a>
## 模型介绍
OCR 技术一般指光学字符识别。OCR(Optical Character Recognition)是指电子设备(例如扫描仪或数码相机)检查纸上打印的字符,通过检测暗、亮的模式确定其形状,然后用字符识别方法将形状翻译成计算机文字的过程。
......@@ -16,8 +28,9 @@ PaddleOCR 提供的 PP-OCR 系列模型覆盖轻量级服务端、轻量级移
| 中英文超轻量移动端模型 | 9.4M | ch_ppocr_mobile_v2.0_xx | 移动端|
| 中英文通用服务端模型 | 143.4M | ch_ppocr_server_v2.0_xx | 服务器端 |
<a name="2"></a>
## 模型步骤
## 部署步骤
前提条件是你已完成[环境安装]()步骤,并已验证环境安装成功,此处不再赘述。
......@@ -25,13 +38,20 @@ PaddleOCR 提供的 PP-OCR 系列模型覆盖轻量级服务端、轻量级移
```
git clone https://github.com/PaddlePaddle/Serving
```
按以下5个步骤操作即可实现 OCR 示例部署。
通过6个步骤操作即可实现 OCR 示例部署。
- 一.获取模型
- 二.保存 Serving 部署的模型参数
- 三.下载测试数据集(可选)
- 四.修改 `config.yml` 配置(可选)
- 五.代码与配置信息绑定
- 六.启动服务与验证
<a name="2.1"></a>
**一.获取模型**
**一.获取模型与保存模型参数**
本章节选用中英文超轻量模型 ch_PP-OCRv2_xx 制作部署案例,模型体积小,效果很好,属于性价比很高的选择。
为了节省大家的时间,已将预训练模型使用[保存模型服务化参数]()方法打包成压缩包,下载并解压即可使用。如你自训练的模型需经过保存模型服务化参数步骤才能服务化部署。
```
python3 -m paddle_serving_app.package --get_model ocr_rec
tar -xzvf ocr_rec.tar.gz
......@@ -39,15 +59,27 @@ python3 -m paddle_serving_app.package --get_model ocr_det
tar -xzvf ocr_det.tar.gz
```
**二.下载测试数据集(可选)**
第二步,下载测试图片集,如使用自有测试数据集,可忽略此步骤。
<a name="2.2"></a>
**二.保存 Serving 部署的模型参数**
为了节省大家的时间,已将预训练模型使用[保存用于 Serving 部署的模型参数](./5-1_Save_Model_Params_CN.md)方法打包成压缩包,下载并解压即可使用。如你自训练的模型需经过保存模型服务化参数步骤才能服务化部署。
<a name="2.3"></a>
**三.下载测试数据集(可选)**
下载测试图片集,如使用自有测试数据集,可忽略此步骤。
```
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/ocr/test_imgs.tar
tar xf test_imgs.tar
```
**三.修改 Config.yml 配置(可选)**
第三步,通过修改配置文件设置服务、图、OP 级别属性。如果使用默认配置,此步骤可忽略。
<a name="2.4"></a>
**四.修改配置文件(可选)**
修改配置文件 `config.yml` 设置服务、图、OP 级别属性。如果使用默认配置,此步骤可忽略。
由于配置项较多,仅重点介绍部分核心选项的使用,完整配置选项说明可参考[ 配置说明]()
```
......@@ -155,8 +187,10 @@ op:
#min_subgraph_size: 3
```
<a name="2.5"></a>
**五.代码与配置信息绑定**
**四.代码与配置信息绑定 **
第四步,实现代码和配置文件 Config.yml 绑定,以及设置多模型组合关系。具体包括:
1. 重写模型前后处理:
......@@ -199,15 +233,52 @@ ocr_service.prepare_pipeline_config("config.yml")
ocr_service.run_service()
```
<a name="2.6"></a>
**五.启动服务与验证**
**六.启动服务与验证**
运行程序 `web_service.py` 启动服务端
启动服务前,可看到程序路径下所有文件路径如下:
```
.
├── 7.jpg
├── benchmark.py
├── benchmark.sh
├── config.yml
├── imgs
│   └── ggg.png
├── ocr_det_client
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
├── ocr_det_model
│   ├── inference.pdiparams
│   ├── inference.pdmodel
│   ├── serving_server_conf.prototxt
│   └── serving_server_conf.stream.prototxt
├── ocr_rec_client
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
├── ocr_rec_model
│   ├── inference.pdiparams
│   ├── inference.pdmodel
│   ├── serving_server_conf.prototxt
│   └── serving_server_conf.stream.prototxt
├── pipeline_http_client.py
├── pipeline_rpc_client.py
├── ppocr_keys_v1.txt
├── ProcessInfo.json
├── README_CN.md
├── README.md
└── web_service.py
```
运行服务程序 `web_service.py` 启动服务端,接收客户端请求,采用图执行引擎执行推理预测。
```
# Run Server
python3 web_service.py &>log.txt &
```
客户端程序 `pipeline_http_client.py` 注册服务端地址,并发送客户端请求。
启动客户端前,要确认服务 URL 格式为 http://{ip}:{port}/{name}/{method}。本项目中 {name} 即是 web_service.py 中 OcrService 的 name 参数 "ocr",{method} 默认为 "prediction"
```
......
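补充一个最小的 HTTP 请求示意(请求体的字段组织方式以仓库中的 pipeline_http_client.py 为准,此处的服务地址、图片路径均为假设):
```python
# 仅为示意:向 Python Pipeline 的 HTTP 服务发送一张 base64 编码的图片
import base64
import json

import requests  # 假设环境中已安装 requests

with open("imgs/ggg.png", "rb") as f:
    image = base64.b64encode(f.read()).decode("utf8")

# URL 格式为 http://{ip}:{port}/{name}/{method},本例中 name=ocr,method=prediction
url = "http://127.0.0.1:9999/ocr/prediction"
data = {"key": ["image"], "value": [image]}
resp = requests.post(url=url, data=json.dumps(data))
print(resp.json())
```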
# 模型库
- [模型分类](#1)
- [1.1 图像分类与识别](#1.1)
- [1.2 文本类](#1.2)
- [1.3 推荐系统](#1.3)
- [1.4 人脸识别](#1.4)
- [1.5 目标检测](#1.5)
- [1.6 文字识别](#1.6)
- [1.7 图像分割](#1.7)
- [1.8 关键点检测](#1.8)
- [1.9 视频理解](#1.9)
- [模型示例库](#2)
Paddle Serving 已实现9个类别,共计46个模型的服务化部署示例。
**一.图像分类与识别**模型部署示例请参阅下表:
<a name="1"></a>
## 模型分类
<a name="1.1"></a>
**一.图像分类与识别**
模型部署示例请参阅下表:
| 场景| 模型 | 类型 | 示例使用的框架 | 下载 |
| --- | --- | --- | --- | ---- |
| 图像识别 |pp_shitu | PaddleClas | [C++ Serving](../examples/C++/PaddleClas/pp_shitu) | [.tar.gz](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/inference/serving/pp_shitu.tar.gz) |
......@@ -22,7 +42,11 @@ Paddle Serving 已实现9个类别,共计46个模型的服务化部署示例
---
**二.文本类**模型部署示例请参阅下表:
<a name="1.2"></a>
**二.文本类**
模型部署示例请参阅下表:
| 场景| 模型 | 类型 | 示例使用的框架 | 下载 |
| --- | --- | --- | --- | ---- |
| 文本生成 | bert_chinese_L-12_H-768_A-12 | PaddleNLP | [C++ Serving](../examples/C++/PaddleNLP/bert)</br>[Pipeline Serving](../examples/Pipeline/PaddleNLP/bert) | [.tar.gz](https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz) |
......@@ -33,8 +57,11 @@ Paddle Serving 已实现9个类别,共计46个模型的服务化部署示例
| 抽取文本向量| In-batch Negatives | PaddleNLP | [Pipeline Serving](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/neural_search/recall/in_batch_negative) | [model](https://bj.bcebos.com/v1/paddlenlp/models/inbatch_model.zip) |
---
<a name="1.3"></a>
**三.推荐系统**
**三.推荐系统**模型部署示例请参阅下表:
模型部署示例请参阅下表:
| 场景| 模型 | 类型 | 示例使用的框架 | 下载 |
| --- | --- | --- | --- | ---- |
| CTR预估 | criteo_ctr | PaddleRec | [C++ Serving](../examples/C++/PaddleRec/criteo_ctr) | [.tar.gz](https://paddle-serving.bj.bcebos.com/criteo_ctr_example/criteo_ctr_demo_model.tar.gz) |
......@@ -42,15 +69,21 @@ Paddle Serving 已实现9个类别,共计46个模型的服务化部署示例
| 内容推荐 | wide&deep | PaddleRec | [C++ Serving](https://github.com/PaddlePaddle/PaddleRec/blob/release/2.1.0/doc/serving.md) | [model](https://github.com/PaddlePaddle/PaddleRec/blob/release/2.1.0/models/rank/wide_deep/README.md) |
---
<a name="1.4"></a>
**四.人脸识别**模型部署示例请参阅下表:
**四.人脸识别**
模型部署示例请参阅下表:
| 场景| 模型 | 类型 | 示例使用的框架 | 下载 |
| --- | --- | --- | --- | ---- |
| 人脸识别|blazeface | PaddleDetection | [C++ Serving](../examples/C++/PaddleDetection/blazeface) | [.tar.gz](https://paddle-serving.bj.bcebos.com/paddle_hub_models/image/ObjectDetection/blazeface.tar.gz) |C++ Serving|
---
<a name="1.5"></a>
**五.目标检测**
**五.目标检测**模型部署示例请参阅下表:
模型部署示例请参阅下表:
| 场景| 模型 | 类型 | 示例使用的框架 | 下载 |
| --- | --- | --- | --- | ---- |
| 目标检测 |cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco | PaddleDetection | [C++ Serving](../examples/C++/PaddleDetection/cascade_rcnn) | [.tar.gz](https://paddle-serving.bj.bcebos.com/pddet_demo/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco_serving.tar.gz) |
......@@ -65,8 +98,11 @@ Paddle Serving 已实现9个类别,共计46个模型的服务化部署示例
| 目标检测 |YOLOv3-DarkNet | PaddleDetection | [C++ Serving](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.3/deploy/serving) | [.pdparams](https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams)</br>[.yml](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/configs/yolov3/yolov3_darknet53_270e_coco.yml) |
---
<a name="1.6"></a>
**六.文字识别**模型部署示例请参阅下表:
**六.文字识别**
模型部署示例请参阅下表:
| 场景| 模型 | 类型 | 示例使用的框架 | 下载 |
| --- | --- | --- | --- | ---- |
| 文字识别 |ocr_rec | PaddleOCR | [C++ Serving](../examples/C++/PaddleOCR/ocr)</br>[Pipeline Serving](../examples/Pipeline/PaddleOCR/ocr) | [.tar.gz](https://paddle-serving.bj.bcebos.com/paddle_hub_models/image/OCR/ocr_rec.tar.gz) |
......@@ -79,27 +115,40 @@ Paddle Serving 已实现9个类别,共计46个模型的服务化部署示例
| 文字识别 |ch_ppocr_server_v2.0 | PaddleOCR | [Pipeline Serving](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/deploy/pdserving/README.md) | [model](https://github.com/PaddlePaddle/PaddleOCR) |
---
<a name="1.7"></a>
**七.图像分割**
**七.图像分割**模型部署示例请参阅下表:
模型部署示例请参阅下表:
| 场景| 模型 | 类型 | 示例使用的框架 | 下载 |
| --- | --- | --- | --- | ---- |
| 图像分割 | deeplabv3 | PaddleSeg | [C++ Serving](../examples/C++/PaddleSeg/deeplabv3) | [.tar.gz](https://paddle-serving.bj.bcebos.com/paddle_hub_models/image/ImageSegmentation/deeplabv3.tar.gz) |
| 图像分割 | unet | PaddleSeg | [C++ Serving](../examples/C++/PaddleSeg/unet_for_image_seg) | [.tar.gz](https://paddle-serving.bj.bcebos.com/paddle_hub_models/image/ImageSegmentation/unet.tar.gz) |
---
<a name="1.8"></a>
**八.关键点检测**模型部署示例请参阅下表:
**八.关键点检测**
模型部署示例请参阅下表:
| 场景| 模型 | 类型 | 示例使用的框架 | 下载 |
| --- | --- | --- | --- | ---- |
| 关键点检测 |faster_rcnn_hrnetv2p_w18_1x | PaddleDetection | [C++ Serving](../examples/C++/PaddleDetection/faster_rcnn_hrnetv2p_w18_1x) | [.tar.gz](https://paddle-serving.bj.bcebos.com/pddet_demo/faster_rcnn_hrnetv2p_w18_1x.tar.gz) |
---
<a name="1.9"></a>
**九.视频理解**
**九.视频理解**模型部署示例请参阅下表:
模型部署示例请参阅下表:
| 场景| 模型 | 类型 | 示例使用的框架 | 下载 |
| --- | --- | --- | --- | ---- |
| 视频理解 |PPTSN_K400 | PaddleVideo | [Pipeline Serving](../examples/Pipeline/PaddleVideo/PPTSN_K400) | [model](https://paddle-serving.bj.bcebos.com/model/PaddleVideo/PPTSN_K400.tar) |
---
<a name="2"></a>
## 模型示例库
Paddle Serving 代码库下模型部署示例请参考 [examples](../examples) 目录。更多 Paddle Serving 部署模型请参考 [wholechain](https://www.paddlepaddle.org.cn/wholechain)
......
# 保存用于 Serving 部署的模型参数
- [背景介绍](#1)
- [功能设计](#2)
- [功能使用](#3)
- [PYTHON 命令执行](#3.1)
- [代码引入执行](#3.2)
- [Serving 部署](#4)
- [服务端部署示例](#4.1)
- [客户端部署示例](#4.2)
<a name="1"></a>
## 背景介绍
模型参数信息保存在模型文件中,为什么还要保存用于 Paddle Serving 部署的模型参数呢,原因有3个:
1. 服务化场景分为客户端和服务端,服务端加载模型,而在客户端没有模型信息,但需要在客户端实现数据拼装和类型转换。
2. 模型升级过程中 `feed vars` 与 `fetch vars` 的名称变化会导致代码升级,通过增加一个 `alias_name` 字段映射名称,代码无需升级。
3. 部署 `Web` 服务,并使用 `URL` 方式访问时,请求信息中缺少类型和维度信息,在服务端推理前需要进行转换。
<a name="2"></a>
## 功能设计
飞桨训推一体框架中,从动态图模型训练到静态图推理部署,一体化流程如下所示
```
①动态图训练 → ②模型动转静 → ③静态模型 → ④模型保存 → ⑤Serving 部署
```
飞桨框架2.1对模型与参数的保存与载入相关接口进行了梳理,完整文档参考[模型保存与载入](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/08_model_save_load_cn.html)
- 对于训练调优场景,我们推荐使用 `paddle.save/load` 保存和载入模型;
- 对于推理部署场景,我们推荐使用 `paddle.jit.save/load`(动态图)和 `paddle.static.save/load_inference_model` (静态图)保存载入模型;
Paddle Serving 模型参数保存接口的定位是:在 `②模型动转静` 导出 `③静态模型` 后,使用 `paddle.static.load_inference_model` 接口加载模型,再使用 `paddle.static.save_vars` 接口保存模型参数。
生成的模型参数信息保存在 `paddle_serving_server/client.prototxt` 文件中,其格式如下
```
feed_var {
name: "x"
alias_name: "image"
is_lod_tensor: false
feed_type: 1
shape: 3
shape: 960
shape: 960
}
fetch_var {
name: "save_infer_model/scale_0.tmp_1"
alias_name: "save_infer_model/scale_0.tmp_1"
is_lod_tensor: false
fetch_type: 1
shape: 1
shape: 960
shape: 960
}
```
| 参数 | 描述 |
|------|---------|
| name | 实际变量名 |
| alias_name | 变量别名,与 name 关联的业务场景中的变量名 |
| is_lod_tensor | 是否为 LOD Tensor |
| feed_type | feed 变量类型|
| fetch_type | fetch 变量类型|
| shape 数组 | 变量的 Shape 信息 |
feed 与 fetch 变量的类型列表如下:
| 类型 | 类型值 |
|------|------|
| int64 | 0 |
| float32 |1 |
| int32 | 2 |
| float64 | 3 |
| int16 | 4 |
| float16 | 5 |
| bfloat16 | 6 |
| uint8 | 7 |
| int8 | 8 |
| bool | 9 |
| complex64 | 10 |
| complex128 | 11 |
<a name="3"></a>
## 功能使用
Paddle 推理模型有3种形式,每种形式读取模型的方式都不同:散列形式必须以路径方式加载,其余2种采用目录或文件方式均可。
1) Paddle 2.0前版本:`__model__`, `__params__`
2) Paddle 2.0后版本:`*.pdmodel`, `*.pdiparams`
3) 散列:`__model__`, `conv2d_1.w_0`, `conv2d_2.w_0`, `fc_1.w_0`, `conv2d_1.b_0`, ...
`paddle_serving_client.convert` 接口既支持 PYTHON 命令方式执行,又支持在代码中引入运行,其参数说明如下。
| 参数 | 类型 | 默认值 | 描述 |
|--------------|------|-----------|--------------------------------|
| `dirname` | str | - | 需要转换的模型文件存储路径,Program结构文件和参数文件均保存在此目录。|
| `serving_server` | str | `"serving_server"` | 转换后的模型文件和配置文件的存储路径。默认值为serving_server |
| `serving_client` | str | `"serving_client"` | 转换后的客户端配置文件存储路径。默认值为serving_client |
| `model_filename` | str | None | 存储需要转换的模型Inference Program结构的文件名称。如果设置为None,则使用 `__model__` 作为默认的文件名 |
| `params_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None |
<a name="3.1"></a>
**一.PYTHON 命令执行**
首先需要安装 `paddle_serving_client` 包,以目录方式加载模型。
示例一,是以模型路径方式加载模型,适用于全部3种类型。
```python
python3 -m paddle_serving_client.convert --dirname ./your_inference_model_dir
```
示例二,指定加载 `当前路径` 下的模型 `dygraph_model.pdmodel` 和 `dygraph_model.pdiparams`,保存结果在 `serving_server` 和 `serving_client` 目录。
```python
python3 -m paddle_serving_client.convert --dirname . --model_filename dygraph_model.pdmodel --params_filename dygraph_model.pdiparams --serving_server serving_server --serving_client serving_client
```
<a name="3.2"></a>
**二.代码引入执行**
代码引入执行方式,通过引入 `paddle_serving_client.io` 包并调用 `inference_model_to_serving` 接口实现模型参数保存。
```python
import paddle_serving_client.io as serving_io
serving_io.inference_model_to_serving(dirname, serving_server="serving_server", serving_client="serving_client", model_filename=None, params_filename=None)
```
<a name="4"></a>
## Serving 部署
生成的模型参数可直接用于服务化推理,包括服务端使用和客户端使用。
<a name="4.1"></a>
**一.服务端部署示例**
示例一:C++ Serving 启动服务
```
python3 -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_id 0
```
示例二:Python Pipeline 启动服务,在 `config.yml` 中指定模型路径
```
op:
det:
#并发数,is_thread_op=True时,为线程并发;否则为进程并发
concurrency: 6
#当op配置没有server_endpoints时,从local_service_conf读取本地服务配置
local_service_conf:
#client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测
client_type: local_predictor
#det模型路径
model_config: ocr_det_model
#Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["save_infer_model/scale_0.tmp_1"]
# device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 0
```
<a name="4.2"></a>
**二.客户端部署示例**
通过 `client` 对象的 `load_client_config` 接口加载模型配置信息
```
from paddle_serving_client import Client
from paddle_serving_app.reader import Sequential, File2Image, Resize, CenterCrop
from paddle_serving_app.reader import RGB2BGR, Transpose, Div, Normalize
client = Client()
client.load_client_config(
"serving_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9393"])
seq = Sequential([
File2Image(), Resize(256), CenterCrop(224), RGB2BGR(), Transpose((2, 0, 1)),
Div(255), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True)
])
image_file = "daisy.jpg"
img = seq(image_file)
fetch_map = client.predict(feed={"inputs": img}, fetch=["save_infer_model/scale_0.tmp_0"])
```
# Serving 配置
## 简介
本文主要介绍 C++ Serving 以及 Python Pipeline 的各项配置:
- [模型配置文件](#模型配置文件): 转换模型时自动生成,描述模型输入输出信息
- [C++ Serving](#c-serving): 用于高性能场景,介绍了快速启动以及自定义配置方法
- [Python Pipeline](#python-pipeline): 用于单算子多模型组合场景
## 模型配置文件
在开始介绍 Server 配置之前,先来介绍一下模型配置文件。我们在将模型转换为 PaddleServing 模型时,会生成对应的 serving_client_conf.prototxt 以及 serving_server_conf.prototxt,两者内容一致,为模型输入输出的参数信息,方便用户拼装参数。该配置文件用于 Server 以及 Client,并不需要用户自行修改。转换方法参考文档《[怎样保存用于Paddle Serving的模型](./Save_CN.md)》。protobuf 格式可参考 `core/configure/proto/general_model_config.proto`
样例如下:
```
feed_var {
name: "x"
alias_name: "x"
is_lod_tensor: false
feed_type: 1
shape: 13
}
fetch_var {
name: "concat_1.tmp_0"
alias_name: "concat_1.tmp_0"
is_lod_tensor: false
fetch_type: 1
shape: 3
shape: 640
shape: 640
}
```
其中
- feed_var:模型输入
- fetch_var:模型输出
- name:名称
- alias_name:别名,与名称对应
- is_lod_tensor:是否为 lod,具体可参考《[Lod字段说明](./LOD_CN.md)
- feed_type:数据类型,详见表格
- shape:数据维度
| feed_type | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 20 |
|-----------|-------|---------|-------|-------|------|------|-------|-------|-----|--------|
| 类型 | INT64 | FLOAT32 | INT32 | FP64 | INT16 | FP16 | BF16 | UINT8 | INT8 | STRING |
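为便于按上表拼装请求数据,下面给出一个将 feed_type 编号映射为 numpy dtype 的小示意(bfloat16 无 numpy 内置类型、string 无需数值转换,故未列出;映射关系以上表为准):
```python
# 仅为示意:feed_type 编号到 numpy dtype 的映射
import numpy as np

FEED_TYPE_TO_DTYPE = {
    0: np.int64,    # INT64
    1: np.float32,  # FLOAT32
    2: np.int32,    # INT32
    3: np.float64,  # FP64
    4: np.int16,    # INT16
    5: np.float16,  # FP16
    7: np.uint8,    # UINT8
    8: np.int8,     # INT8
}

# 例如上文 feed_var 中 feed_type: 1、shape: 13,对应一条 float32 输入
x = np.random.rand(1, 13).astype(FEED_TYPE_TO_DTYPE[1])
```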
## C++ Serving
**一. 快速启动与关闭**
可以通过配置模型及端口号快速启动服务,启动命令如下:
```BASH
python3 -m paddle_serving_server.serve --model serving_model --port 9393
```
该命令会自动生成配置文件,并使用生成的配置文件启动 C++ Serving。例如上述启动命令会自动生成 workdir_9393 目录,其结构如下
```
workdir_9393
├── general_infer_0
│   ├── fluid_time_file
│   ├── general_model.prototxt
│   └── model_toolkit.prototxt
├── infer_service.prototxt
├── resource.prototxt
└── workflow.prototxt
```
更多启动参数详见下表:
| Argument | Type | Default | Description |
| ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- |
| `thread` | int | `2` | BRPC 服务的线程数 |
| `runtime_thread_num` | int[]| `0` | 异步模式下每个模型的线程数 |
| `batch_infer_size` | int[]| `32` | 异步模式下每个模型的 Batch 数 |
| `gpu_ids` | str[]| `"-1"` | 设置每个模型的 GPU id,例如当使用多卡部署时,可设置 "0,1,2" |
| `port` | int | `9292` | 服务的端口号 |
| `model` | str[]| `""` | 模型文件路径,例如包含两个模型时,可设置 "serving_model_1 serving_model_2" |
| `mem_optim_off` | - | - | 是否关闭内存优化选项 |
| `ir_optim` | bool | False | 是否开启图优化 |
| `use_mkl` (Only for cpu version) | - | - | 开启 MKL 选项,需要与 ir_optim 配合使用 |
| `use_trt` (Only for trt version) | - | - | 开启 TensorRT,需要与 ir_optim 配合使用 |
| `use_lite` (Only for Intel x86 CPU or ARM CPU) | - | - | 开启 PaddleLite,需要与 ir_optim 配合使用 |
| `use_xpu` | - | - | 开启百度昆仑 XPU 配置,需要与 ir_optim 配合使用 |
| `precision` | str | FP32 | 精度配置,支持 FP32, FP16, INT8 |
| `use_calib` | bool | False | 是否开启 TRT int8 校准模式 |
| `gpu_multi_stream` | bool | False | 是否开启 GPU 多流模式 |
| `use_ascend_cl` | bool | False | 开启昇腾配置,单独开启时适配 910,与 use_lite 共同开启时适配 310 |
| `request_cache_size` | int | `0` | 请求缓存的容量大小。默认为 0 时,缓存关闭 |
**二. 自定义配置启动**
一般情况下,自动生成的配置可以应对大部分场景。对于特殊场景,用户也可自行定义配置文件。这些配置文件包括 service.prototxt(配置服务列表)、workflow.prototxt(配置 OP 流程 workflow)、resource.prototxt(指定模型配置文件)、model_toolkit.prototxt(配置模型信息和预测引擎)、proj.conf(配置服务参数)。启动命令如下:
```BASH
/bin/serving --flagfile=proj.conf
```
1. proj.conf
proj.conf 用于传入服务参数,并指定了其他相关配置文件的路径。如果重复传入参数,则以最后传入的参数值为准。
```
# for paddle inference
--precision=fp32
--use_calib=False
--reload_interval_s=10
# for brpc
--max_concurrency=0
--num_threads=10
--bthread_concurrency=10
--max_body_size=536870912
# default path
--inferservice_path=conf
--inferservice_file=infer_service.prototxt
--resource_path=conf
--resource_file=resource.prototxt
--workflow_path=conf
--workflow_file=workflow.prototxt
```
各项参数的描述及默认值详见下表:
| name | Default | Description |
|------|--------|------|
|precision|"fp32"|Precision Mode, support FP32, FP16, INT8|
|use_calib|False|Only for deployment with TensorRT|
|reload_interval_s|10|Reload interval|
|max_concurrency|0|Limit of request processing in parallel, 0: unlimited|
|num_threads|10|Number of brpc service thread|
|bthread_concurrency|10|Number of bthread|
|max_body_size|536870912|Max size of brpc message|
|inferservice_path|"conf"|Path of inferservice conf|
|inferservice_file|"infer_service.prototxt"|Filename of inferservice conf|
|resource_path|"conf"|Path of resource conf|
|resource_file|"resource.prototxt"|Filename of resource conf|
|workflow_path|"conf"|Path of workflow conf|
|workflow_file|"workflow.prototxt"|Filename of workflow conf|
2. service.prototxt
service.prototxt 用于配置 Paddle Serving 实例挂载的 service 列表。通过 `--inferservice_path``--inferservice_file` 指定加载路径。protobuf 格式可参考 `core/configure/server_configure.protobuf``InferServiceConf`。示例如下:
```
port: 8010
services {
name: "GeneralModelService"
workflows: "workflow1"
}
```
其中:
- port: 用于配置 Serving 实例监听的端口号。
- services: 使用默认配置即可,不可修改。name 指定 service 名称,workflow1 的具体定义在 workflow.prototxt
3. workflow.prototxt
workflow.prototxt 用来描述具体的 workflow。通过 `--workflow_path``--workflow_file` 指定加载路径。protobuf 格式可参考 `configure/server_configure.protobuf``Workflow` 类型。自定义 OP 请参考 [自定义OP]()
如下示例,workflow 由3个 OP 构成,GeneralReaderOp 用于读取数据,GeneralInferOp 依赖于 GeneralReaderOp 并进行预测,GeneralResponseOp 将预测结果返回:
```
workflows {
name: "workflow1"
workflow_type: "Sequence"
nodes {
name: "general_reader_0"
type: "GeneralReaderOp"
}
nodes {
name: "general_infer_0"
type: "GeneralInferOp"
dependencies {
name: "general_reader_0"
mode: "RO"
}
}
nodes {
name: "general_response_0"
type: "GeneralResponseOp"
dependencies {
name: "general_infer_0"
mode: "RO"
}
}
}
```
其中:
- name: workflow 名称,用于从 service.prototxt 索引到具体的 workflow
- workflow_type: 只支持 "Sequence"
- nodes: 用于串联成 workflow 的所有节点,可配置多个 nodes。nodes 间通过配置 dependencies 串联起来
- node.name: 与 node.type 一一对应,具体可参考 `python/paddle_serving_server/dag.py`
- node.type: 当前 node 所执行 OP 的类名称,与 serving/op/ 下每个具体的 OP 类的名称对应
- node.dependencies: 依赖的上游 node 列表
- node.dependencies.name: 与 workflow 内节点的 name 保持一致
- node.dependencies.mode: RO-Read Only, RW-Read Write
4. resource.prototxt
resource.prototxt,用于指定模型配置文件。通过 `--resource_path``--resource_file` 指定加载路径。它的 protobuf 格式参考 `core/configure/proto/server_configure.proto``ResourceConf`。示例如下:
```
model_toolkit_path: "conf"
model_toolkit_file: "general_infer_0/model_toolkit.prototxt"
general_model_path: "conf"
general_model_file: "general_infer_0/general_model.prototxt"
```
其中:
- model_toolkit_path: 用来指定 model_toolkit.prototxt 所在的目录
- model_toolkit_file: 用来指定 model_toolkit.prototxt 所在的文件名
- general_model_path: 用来指定 general_model.prototxt 所在的目录
- general_model_file: 用来指定 general_model.prototxt 所在的文件名
5. model_toolkit.prototxt
用来配置模型信息和预测引擎。它的 protobuf 格式参考 `core/configure/proto/server_configure.proto` 的 ModelToolkitConf。model_toolkit.protobuf 的磁盘路径不能通过命令行参数覆盖。示例如下:
```
engines {
name: "general_infer_0"
type: "PADDLE_INFER"
reloadable_meta: "uci_housing_model/fluid_time_file"
reloadable_type: "timestamp_ne"
model_dir: "uci_housing_model"
gpu_ids: -1
enable_memory_optimization: true
enable_ir_optimization: false
use_trt: false
use_lite: false
use_xpu: false
use_gpu: false
combined_model: false
gpu_multi_stream: false
use_ascend_cl: false
runtime_thread_num: 0
batch_infer_size: 32
enable_overrun: false
allow_split_request: true
}
```
其中
- name: 引擎名称,与 workflow.prototxt 中的 node.name 以及所在目录名称对应
- type: 预测引擎的类型。当前只支持 "PADDLE_INFER"
- reloadable_meta: 目前实际内容无意义,用来通过对该文件的 mtime 判断是否超过 reload 时间阈值
- reloadable_type: 检查 reload 条件:timestamp_ne/timestamp_gt/md5sum/revision/none,详见表格
- model_dir: 模型文件路径
- gpu_ids: 引擎运行时使用的 GPU device id,支持指定多个(例如 0、1、2)
- enable_memory_optimization: 是否开启 memory 优化
- enable_ir_optimization: 是否开启 ir 优化
- use_trt: 是否开启 TensorRT,需同时开启 use_gpu
- use_lite: 是否开启 PaddleLite
- use_xpu: 是否使用昆仑 XPU
- use_gpu: 是否使用 GPU
- combined_model: 是否使用组合模型文件
- gpu_multi_stream: 是否开启 gpu 多流模式
- use_ascend_cl: 是否使用昇腾,单独开启适配昇腾 910,同时开启 lite 适配 310
- runtime_thread_num: 若大于 0, 则启用 Async 异步模式,并创建对应数量的 predictor 实例。
- batch_infer_size: Async 异步模式下的最大 batch 数
- enable_overrun: Async 异步模式下总是将整个任务放入任务队列
- allow_split_request: Async 异步模式下允许拆分任务
|reloadable_type|含义|
|---------------|----|
|timestamp_ne|reloadable_meta 所指定文件的 mtime 时间戳发生变化|
|timestamp_gt|reloadable_meta 所指定文件的 mtime 时间戳大于等于上次检查时记录的 mtime 时间戳|
|md5sum|目前无用,配置后永远不 reload|
|revision|目前无用,配置后永远不 reload|
6. general_model.prototxt
general_model.prototxt 内容与模型配置 serving_server_conf.prototxt 相同,用于描述模型输入输出参数信息。
## Python Pipeline
**一. 快速启动与关闭**
Python Pipeline 启动脚本如下,脚本实现请参考[Pipeline Serving]():
```BASH
python3 web_service.py
```
当您想要关闭 Serving 服务时,可在 Pipeline 启动目录下或环境变量 SERVING_HOME 路径下执行以下命令。
stop 参数发送 SIGINT 至 Pipeline Serving;若在 Linux 系统中将 stop 改成 kill,则发送 SIGKILL 信号至 Pipeline Serving
```BASH
python3 -m paddle_serving_server.serve stop
```
**二. 配置文件**
Python Pipeline 提供了用户友好的多模型组合服务编程框架,适用于多模型组合应用的场景。
其配置文件为 YAML 格式,一般默认为 config.yaml。示例如下:
```YAML
rpc_port: 18090
http_port: 9999
worker_num: 20
build_dag_each_worker: false
```
| Argument | Type | Default | Description |
| ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- |
| `rpc_port` | int | `18090` | rpc 端口, rpc_port 和 http_port 不允许同时为空。当 rpc_port 为空且 http_port 不为空时,会自动将 rpc_port 设置为 http_port+1 |
| `http_port` | int| `9999` | http 端口, rpc_port 和 http_port 不允许同时为空。当 rpc_port 可用且 http_port 为空时,不自动生成 http_port |
| `worker_num` | int| `20` | 最大并发数。当 build_dag_each_worker=True 时,框架会创建 worker_num 个进程,每个进程内构建 gRPC Server 和 DAG;当 build_dag_each_worker=False 时,框架会设置主线程 grpc 线程池的 max_workers=worker_num |
| `build_dag_each_worker` | bool| `false` | False,框架在进程内创建一条 DAG;True,框架会在每个 worker 进程内创建各自独立的 DAG |
```YAML
dag:
is_thread_op: False
retry: 1
use_profile: false
tracer:
interval_s: 10
client_type: local_predictor
channel_size: 0
channel_recv_frist_arrive: False
```
| Argument | Type | Default | Description |
| ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- |
| `is_thread_op` | bool | `false` | op 资源类型, True, 为线程模型;False,为进程模型 |
| `retry` | int | `1` | 重试次数 |
| `use_profile` | bool | `false` | 使用性能分析, True,生成 Timeline 性能数据,对性能有一定影响;False 为不使用 |
| `tracer:interval_s` | int | `10 ` | Tracer 监控的打点时间间隔,单位为秒 |
| `client_type` | string | `local_predictor` | client 类型,包括 brpc, grpc 和 local_predictor.local_predictor 不启动 Serving 服务,进程内预测 |
| `channel_size` | int | `0` | channel 的最大长度,默认为0 |
| `channel_recv_frist_arrive` | bool | `false` | 针对大模型分布式场景下的 tensor 并行,接收第一个返回结果后即丢弃其他结果,以提升速度 |
```YAML
op:
op1:
#并发数,is_thread_op=True 时,为线程并发;否则为进程并发
concurrency: 6
#Serving IPs
#server_endpoints: ["127.0.0.1:9393"]
#Fetch 结果列表,以 client_config 中 fetch_var 的 alias_name 为准
#fetch_list: ["concat_1.tmp_0"]
#det 模型 client 端配置
#client_config: serving_client_conf.prototxt
#Serving 交互超时时间, 单位 ms
#timeout: 3000
#Serving 交互重试次数,默认不重试
#retry: 1
# 批量查询 Serving 的数量, 默认 1。batch_size>1 要设置 auto_batching_timeout,否则不足 batch_size 时会阻塞
#batch_size: 2
# 批量查询超时,与 batch_size 配合使用
#auto_batching_timeout: 2000
#当 op 配置没有 server_endpoints 时,从 local_service_conf 读取本地服务配置
local_service_conf:
#client 类型,包括 brpc, grpc 和 local_predictor.local_predictor 不启动 Serving 服务,进程内预测
client_type: local_predictor
#det 模型路径
model_config: ocr_det_model
#Fetch 结果列表,以 client_config 中 fetch_var 的 alias_name 为准
fetch_list: ["concat_1.tmp_0"]
# device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu, 5=arm ascend310, 6=arm ascend910
device_type: 0
#计算硬件 ID,当 devices 为""或不写时为 CPU 预测;当 devices 为 "0", "0,1,2" 时为 GPU 预测,表示使用的 GPU 卡
devices: ""
#use_mkldnn, 开启 mkldnn 时,必须同时设置 ir_optim=True,否则无效
#use_mkldnn: True
#ir_optim, 开启 TensorRT 时,必须同时设置 ir_optim=True,否则无效
ir_optim: True
#CPU 计算线程数,在 CPU 场景开启会降低单次请求响应时长
#thread_num: 10
#precsion, 预测精度,降低预测精度可提升预测速度
#GPU 支持: "fp32"(default), "fp16", "int8";
#CPU 支持: "fp32"(default), "fp16", "bf16"(mkldnn); 不支持: "int8"
precision: "fp32"
#mem_optim, memory / graphic memory optimization
#mem_optim: True
#use_calib, Use TRT int8 calibration
#use_calib: False
#The cache capacity of different input shapes for mkldnn
#mkldnn_cache_capacity: 0
#mkldnn_op_list, op list accelerated using MKLDNN, None default
#mkldnn_op_list: []
#mkldnn_bf16_op_list,op list accelerated using MKLDNN bf16, None default.
#mkldnn_bf16_op_list: []
#min_subgraph_size,the minimal subgraph size for opening tensorrt to optimize, 3 default
#min_subgraph_size: 3
```
| Argument | Type | Default | Description |
| ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- |
| `concurrency` | int | `6` | 并发数,is_thread_op=True 时,为线程并发;否则为进程并发 |
| `server_endpoints` | list | `-` | 服务 IP 列表 |
| `fetch_list` | list | `-` | Fetch 结果列表,以 client_config 中 fetch_var 的 alias_name 为准 |
| `client_config` | string | `-` | 模型 client 端配置 |
| `timeout` | int | `3000` | Serving 交互超时时间, 单位 ms |
| `retry` | int | `1` | Serving 交互重试次数,默认不重试 |
| `batch_size` | int | `1` | 批量查询 Serving 的数量, 默认 1。batch_size>1 要设置 auto_batching_timeout,否则不足 batch_size 时会阻塞 |
| `auto_batching_timeout` | int | `2000` | 批量查询超时,与 batch_size 配合使用 |
| `local_service_conf` | map | `-` | 当 op 配置没有 server_endpoints 时,从 local_service_conf 读取本地服务配置 |
| `client_type` | string | `-` | client 类型,包括 brpc, grpc 和 local_predictor.local_predictor 不启动 Serving 服务,进程内预测 |
| `model_config` | string | `-` | 模型路径 |
| `fetch_list` | list | `-` | Fetch 结果列表,以 client_config 中 fetch_var 的 alias_name 为准 |
| `device_type` | int | `0` | 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu, 5=arm ascend310, 6=arm ascend910 |
| `devices` | string | `-` | 计算硬件 ID,当 devices 为""或不写时为 CPU 预测;当 devices 为 "0", "0,1,2" 时为 GPU 预测,表示使用的 GPU 卡 |
| `use_mkldnn` | bool | `True` | use_mkldnn, 开启 mkldnn 时,必须同时设置 ir_optim=True,否则无效|
| `ir_optim` | bool | `True` | 开启 TensorRT 时,必须同时设置 ir_optim=True,否则无效 |
| `thread_num` | int | `10` | CPU 计算线程数,在 CPU 场景开启会降低单次请求响应时长|
| `precision` | string | `fp32` | 预测精度,降低预测精度可提升预测速度,GPU 支持: "fp32"(default), "fp16", "int8";CPU 支持: "fp32"(default), "fp16", "bf16"(mkldnn); 不支持: "int8" |
| `mem_optim` | bool | `True` | 内存优化选项 |
| `use_calib` | bool | `False` | TRT int8 量化校准模型 |
| `mkldnn_cache_capacity` | int | `0` | mkldnn 的不同输入尺寸缓存大小 |
| `mkldnn_op_list` | list | `-` | mkldnn 加速的 op 列表 |
| `mkldnn_bf16_op_list` | list | `-` | mkldnn bf16 加速的 op 列表 |
| `min_subgraph_size` | int | `3` | 开启 tensorrt 优化的最小子图大小 |
**三. 进阶参数配置**
更多进阶参数配置(如单机多卡推理、异构硬件、低精度推理等)可以参照下表中的文档,另请参考[Pipeline Serving 典型示例]()
| 特性 | 文档 |
| ---- | ---- |
| 单机多卡推理| [Pipeline Serving]() |
| 异构硬件| [Pipeline Serving]() |
| 低精度推理| [Pipeline Serving]() |
# 进阶 C++ Serving 介绍
## 概述
本文将对 C++ Serving 除基本功能之外的高级特性、性能调优等问题进行介绍和说明,本文适合以下用户:
- 想要全面了解 C++ Serving 源码
- 想要了解模型热加载、A/B Test、加密模型推理服务等高级特性
- 通过修改 C++ Serving 参数进行性能调优
## 协议
当您需要自行组装 Request 请求中的数据或者需要二次开发时,您可以参考[相关文档]()。
## 模型热加载
当您需要在 Server 端不停止的情况下更新模型时,您可以参考[相关文档]()。
## A/B Test
当您需要将用户的请求按照一定的流量比例发送到不同的 Server 端时,您可以参考[相关文档]()。
## 加密模型推理服务
当您需要将模型加密部署到 Server 端时,您可以参考[相关文档]()。
## 多模型串联
当您需要将多个模型串联在同一个 Server 中部署时(例如 OCR 需要串联 Det 和 Rec),您可以参考该部分内容。
## 性能优化指南
当您想要对 C++ Serving 服务端进行性能调优时,您可以参考[相关文档]()。
## 性能指标
当您想要了解 C++ Serving 与竞品的性能对比数据时,您可以参考[相关文档]()。
# C++ Serving 异步模式
- [设计方案](#1)
- [网络同步线程](#1.1)
- [异步调度线程](#1.2)
- [动态批量](#1.3)
- [使用案例](#2)
- [开启同步模式](#2.1)
- [开启异步模式](#2.2)
- [性能测试](#3)
- [测试结果](#3.1)
- [测试数据](#3.2)
<a name="1"></a>
## 设计方案
<a name="1.1"></a>
**一.同步网络线程**
Paddle Serving 的网络框架层面是同步处理模式,即 bRPC 网络处理线程从系统内核拿到完整请求数据后( epoll 模式),在同一线程内完成业务处理,C++ Serving 默认使用同步模式。同步模式比较简单直接,适用于模型预测时间短,或单个 Request 请求批量较大的情况。
<p align="center">
<img src='../images/syn_mode.png' width = "350" height = "300">
<p>
Server 端线程数 N = 模型预测引擎数 N = 同时处理 Request 请求数 N,超发的 Request 请求需要等待当前线程处理结束后才能得到响应和处理。
<a name="1.2"></a>
**二.异步调度线程**
为了提高计算芯片吞吐和计算资源利用率,C++ Serving 在调度层实现异步多线程并发合并请求,实现动态批量推理。异步模式主要适用于模型支持批量、单个 Request 请求无批量或批量较小、单次预测时间较长的情况。
<p align="center">
<img src='../images/asyn_mode.png'>
<p>
异步模式下,Server 端 N 个线程只负责接收 Request 请求,实际调用预测引擎是在异步框架的线程池中,异步框架的线程数可以由配置选项来指定。为了方便理解,我们假设每个 Request 请求批量均为1,此时异步框架会尽可能多得从请求池中取 n(n≤M)个 Request 并将其拼装为1个 Request(batch=n),调用1次预测引擎,得到1个 Response(batch = n),再将其对应拆分为 n 个 Response 作为返回结果。
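下面用一段极简的 Python 代码示意这种“合并-推理-拆分”的调度过程(仅为帮助理解的示意,假设各请求除 batch 维外 shape 相同,`infer_fn` 代表一次引擎预测):
```python
import numpy as np

# 示意:把 n 个 batch=1 的请求拼成 1 个 batch=n 的请求,推理 1 次后再拆回 n 个结果
def merge_infer_split(requests, infer_fn):
    merged = np.concatenate(requests, axis=0)            # 拼装成 batch=n
    merged_out = infer_fn(merged)                        # 调用 1 次预测引擎
    return np.split(merged_out, len(requests), axis=0)   # 拆分为 n 个 Response

outs = merge_infer_split([np.ones((1, 3)), np.zeros((1, 3))], infer_fn=lambda x: x * 2)
print([o.shape for o in outs])   # [(1, 3), (1, 3)]
```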
<a name="1.3"></a>
**三.动态批量**
通常,异步框架合并多个请求的前提是所有请求的 `feed var` 的维度除 batch 维度外必须是相同的。例如,以 OCR 文字识别案例中检测模型为例,A 请求的 `x` 变量的 shape 是 [1, 3, 960, 960],B 请求的 `x` 变量的 shape 是 [2, 3, 960, 960],虽然第一个维度值不相同,但第一个维度属于 `batch` 维度,因此,请求 A 和 请求 B 可以合并。C 请求的 `x` 变量的 shape 是 [1, 3, 640, 480],由于除了 `batch` 维度外还有2个维度值不同,A 和 C 不能直接合并。
从经验来看,当2个请求的同一个变量 shape 维度的数量相等时,通过 `padding` 补0的方式按最大 shape 值对齐即可。即 C 请求的 shape 补齐到 [1, 3, 960, 960],那么就可以与 A 和 B 请求合并了。Paddle Serving 框架实现了动态 Padding 功能补齐 shape。
当多个将要合并的请求中有一个 shape 值很大时,所有请求的 shape 都要按最大补齐,导致计算量成倍增长。Paddle Serving 设计了一套合并策略,满足任何一个条件均可合并:
- 条件 1:补齐的绝对字节数小于 **1024** 字节,评估补齐的绝对数据量
- 条件 2:各维度相似度的乘积大于 **50%**,评估补齐的整体数据量比例
场景1:`Shape-1 = [batch, 500, 500], Shape-2 = [batch, 400, 400]`。此时,`绝对值差 = 500*500 - 400*400 = 90000` 字节,`相似度 = (400/500) * (400/500) = 0.8*0.8 = 0.64`,不满足条件1,满足条件2,触发动态 Padding。
场景2:`Shape-1 = [batch, 1, 1], Shape-2 = [batch, 2, 2]`。此时,`绝对值差 = 2*2 - 1*1 = 3` 字节,`相似度 = (1/2) * (1/2) = 0.5*0.5 = 0.25`,满足条件1,不满足条件2,触发动态 Padding。
场景3:`Shape-1 = [batch, 3, 320, 320], Shape-2 = [batch, 3, 960, 960]`。此时,`绝对值差 = 3*960*960 - 3*320*320 = 2457600` 字节,`相似度 = (3/3) * (320/960) * (320/960) ≈ 0.11`,条件1和条件2均不满足,未触发动态 Padding。
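下面给出一段按上述两个条件判断能否合并的 Python 示意(阈值 1024 字节与 50% 相似度取自上文;与上文场景的算法一致,这里把一个元素按 1 字节估算,仅用于说明判断逻辑):
```python
import numpy as np

def can_merge(shape_a, shape_b, elem_bytes=1):
    """shape_a/shape_b 为去掉 batch 维的 shape;任一条件满足即可触发动态 Padding 合并。"""
    a, b = np.array(shape_a, dtype=float), np.array(shape_b, dtype=float)
    abs_diff = abs(a.prod() - b.prod()) * elem_bytes                   # 条件1:补齐的绝对数据量(字节)
    similarity = float((np.minimum(a, b) / np.maximum(a, b)).prod())   # 条件2:各维度相似度的乘积
    return abs_diff < 1024 or similarity > 0.5

print(can_merge([500, 500], [400, 400]))        # True,相似度 0.64 > 50%
print(can_merge([1, 1], [2, 2]))                # True,绝对差 3 字节 < 1024
print(can_merge([3, 320, 320], [3, 960, 960]))  # False,两个条件均不满足
```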
<a name="2"></a>
## 使用案例
<a name="2.1"></a>
**一.开启同步模式**
启动命令不使用 `--runtime_thread_num` 和 `--batch_infer_size` 时,属于同步处理模式,未开启异步模式。`--thread 16` 表示启动16个同步网络处理线程。
```
python3 -m paddle_serving_server.serve --model uci_housing_model --thread 16 --port 9292
```
<a name="2.2"></a>
**二.开启异步模式**
启动命令使用 `--runtime_thread_num` 和 `--batch_infer_size` 开启异步模式,如下面命令会启动4个异步线程,单次合并最大批量为32,自动开启动态 Padding。
```
python3 -m paddle_serving_server.serve --model uci_housing_model --thread 16 --port 9292 --runtime_thread_num 4 --batch_infer_size 32 --ir_optim --gpu_multi_stream --gpu_ids 0
```
<a name="3"></a>
## 性能测试
- GPU:Tesla P4 7611 MiB
- CUDA:cuda11.2-cudnn8-trt8
- Python 版本:python3.7
- 模型:ResNet_v2_50
- 测试数据:构造全1输入,单client请求100次,shape 范围(1, 224 ± 50, 224 ± 50)
同步模式启动命令:
```
python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --thread 8 --ir_optim --gpu_multi_stream --gpu_ids 1 --enable_prometheus --prometheus_port 1939
```
异步模式启动命令:
```
python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --thread 64 --runtime_thread_num 8 --ir_optim --gpu_multi_stream --gpu_ids 1 --enable_prometheus --prometheus_port 19393
```
<a name="3.1"></a>
**一.测试结果**
使用异步模式,并开启动态批量后,并发测试不同 shape 数据时,吞吐性能大幅提升。
<div align=center>
<img src='images/6-1_Cpp_Asynchronous_Framwork_CN_1.png' height = "600" align="middle"/>
</div>
由于动态批量导致响应时长增长,经过测试,大多数场景下吞吐增量高于响应时长增长,尤其在高并发场景(client=70时),在响应时长增长 33% 情况下,吞吐增加 105%。
|Client |1 |5 |10 | 20 |30 |40 |50 |70 |
|---|---|---|---|---|---|---|---|---|
|QPS |-2.08% |-7.23% |-1.89% |20.55% |23.02% |23.34% |46.41% |105.27% |
|响应时长 | 2.70% |7.09% |5.24% |13.34% |10.80% |43.60% |8.72% |33.89% |
异步模式可有效提升服务吞吐性能。
<a name="3.2"></a>
**二.测试数据**
1. 同步模式
| client_num | batch_size |CPU_util_pre(%) |CPU_util(%) |GPU_memory(mb) |GPU_util(%) |qps(samples/s) |total count |mean(ms) |median(ms) |80 percent(ms) |90 percent(ms) |99 percent(ms) |total cost(s) |each cost(s)|infer_count_total|infer_cost_total(ms)|infer_cost_avg(ms)|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 |1 |1.30 |18.90 |2066 |71.56 |22.938 |100 |43.594 |23.516 |78.118 |78.323 |133.544 |4.4262 |4.3596 |7100.0000 |1666392.70 | 41.1081 |
| 5 |1 |2.00 |28.20 |3668 |92.57 |33.630 |500 |148.673 |39.531 |373.231 |396.306 |419.088 |15.0606 |14.8676 |7600.0000 |1739372.7480| 145.9601 |
|10 |1 |1.90 |29.80 |4202 |91.98 |34.303 |1000 |291.512 |76.728 |613.963 |632.736 |1217.863 |29.8004 |29.1516 |8600.0000 |1974147.7420| 234.7750 |
|20 |1 |4.70 |49.60 |4736 |92.63 |34.359 |2000 |582.089 |154.952 |1239.115 |1813.371 |1858.128 |59.7303 |58.2093 |12100.0000 |2798459.6330 |235.6248 |
|30 |1 |5.70 |65.70 |4736 |92.60 |34.162 |3000 |878.164 |231.121 |2391.687 |2442.744 |2499.963 |89.6546 |87.8168 |17600.0000 |4100408.9560 |236.6877 |
|40 |1 |5.40 |74.40 |5270 |92.44 |34.090 |4000 |1173.373 |306.244 |3037.038 |3070.198 |3134.894 |119.4162 |117.3377 |21600.0000 |5048139.2170 |236.9326|
|50 |1 |1.40 |64.70 |5270 |92.37 |34.031 |5000 |1469.250 |384.327 |3676.812 |3784.330 |4366.862 |149.7041 |146.9254 |26600.0000 |6236269.4230 |237.6260|
|70 |1 |3.70 |79.70 |5270 |91.89 |33.976 |7000 |2060.246 |533.439 |5429.255 |5552.704 |5661.492 |210.1008 |206.0250 |33600.0000 |7905005.9940 |238.3909|
2. 异步模式 - 未开启动态批量
| client_num | batch_size |CPU_util_pre(%) |CPU_util(%) |GPU_memory(mb) |GPU_util(%) |qps(samples/s) |total count |mean(ms) |median(ms) |80 percent(ms) |90 percent(ms) |99 percent(ms) |total cost(s) |each cost(s)|infer_count_total|infer_cost_total(ms)|infer_cost_avg(ms)|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 |1 |6.20 |13.60 |5170 |71.11 |22.894 |100 |43.677 |23.992 |78.285 |78.788 |123.542 |4.4253 |4.3679 |3695.0000 |745061.9120 |40.6655 |
| 5 |1 |6.10 |32.20 |7306 |89.54 |33.532 |500 |149.109 |43.906 |376.889 |401.999 |422.753 |15.1623 |14.9113 |4184.0000 |816834.2250 |146.7736|
|10 |1 |4.90 |43.60 |7306 |91.55 |38.136 |1000 |262.216 |75.393 |575.788 |632.016 |1247.775 |27.1019 |26.2220 |5107.0000 |1026490.3950 |227.1464|
|20 |1 |5.70 |39.60 |7306 |91.36 |58.601 |2000 |341.287 |145.774 |646.824 |994.748 |1132.979 |38.3915 |34.1291 |7461.0000 |1555234.6260 |229.9113|
|30 |1 |1.30 |45.40 |7484 |91.10 |69.008 |3000 |434.728 |204.347 |959.184 |1092.181 |1661.289 |46.3822 |43.4732 |10289.0000 |2269499.9730 |249.4257|
|40 |1 |3.10 |73.00 |7562 |91.83 |80.956 |4000 |494.091 |272.889 |966.072 |1310.011 |1851.887 |52.0609 |49.4095 |12102.0000 |2678878.2010 |225.8016|
|50 |1 |0.80 |68.00 |7522 |91.10 |83.018 |5000 |602.276 |364.064 |1058.261 |1473.051 |1671.025 |72.9869 |60.2280 |14225.0000 |3256628.2820 |272.1385|
|70 |1 |6.10 |78.40 |7584 |92.02 |65.069 |7000 |1075.777 |474.014 |2411.296 |2705.863 |3409.085 |111.6653 |107.5781 |17974.0000 |4139377.4050 |235.4626
3. 异步模式 - 开启动态批量
| client_num | batch_size |CPU_util_pre(%) |CPU_util(%) |GPU_memory(mb) |GPU_util(%) |qps(samples/s) |total count |mean(ms) |median(ms) |80 percent(ms) |90 percent(ms) |99 percent(ms) |total cost(s) |each cost(s)|infer_count_total|infer_cost_total(ms)|infer_cost_avg(ms)|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 |1 |1.20 |13.30 |6048 |70.07 |22.417 |100 |44.606 |24.486 |78.365 |78.707 |139.349 |4.5201 |4.4608 |1569.0000 |462418.6390 |41.7646 |
| 5 |1 |1.20 |50.80 |7116 |87.37 |31.106 |500 |160.740 |42.506 |414.903 |458.841 |481.112 |16.3525 |16.0743 |2059.0000 |539439.3300 |157.1851
|10 |1 |0.80 |26.20 |7264 |88.74 |37.417 |1000 |267.254 |79.452 |604.451 |686.477 |1345.528 |27.9848 |26.7258 |2950.0000 |752428.0570 |239.0446|
|20 |1 |1.50 |32.80 |7264 |89.52 |70.641 |2000 |283.117 |133.441 |516.066 |652.089 |1274.957 |33.0280 |28.3121 |4805.0000 |1210814.5610 |260.5873|
|30 |1 |0.90 |59.10 |7348 |89.57 |84.894 |3000 |353.380 |217.385 |613.587 |757.829 |1277.283 |40.7093 |35.3384 |6924.0000 |1817515.1710 |276.3695|
|40 |1 |1.30 |57.30 |7356 |89.30 |99.853 |4000 |400.584 |204.425 |666.015 |1031.186 |1380.650 |49.4807 |40.0588 |8104.0000 |2200137.0060 |324.2558|
|50 |1 |1.50 |50.60 |7578 |89.04 |121.545 |5000 |411.364 |331.118 |605.809 |874.543 |1285.650 |48.2343 |41.1369 |9350.0000 |2568777.6400 |295.8593|
|70 |1 |3.80 |83.20 |7602 |89.59 |133.568 |7000 |524.073 |382.653 |799.463 |1202.179 |1576.809 |57.2885 |52.4077 |10761.0000 |3013600.9670 |315.2540|
# Inference Protocols
C++ Serving 基于 BRPC 进行服务构建,支持 BRPC、GRPC、RESTful 请求。请求数据为 protobuf 格式,详见 `core/general-server/proto/general_model_service.proto`。本文介绍构建请求以及解析结果的方法。
## Tensor
**一.Tensor 定义**
Tensor 可以装载多种类型的数据,是 Request 和 Response 的基础单元。Tensor 的具体定义如下:
```protobuf
message Tensor {
// VarType: INT64
repeated int64 int64_data = 1;
// VarType: FP32
repeated float float_data = 2;
// VarType: INT32
repeated int32 int_data = 3;
// VarType: FP64
repeated double float64_data = 4;
// VarType: UINT32
repeated uint32 uint32_data = 5;
// VarType: BOOL
repeated bool bool_data = 6;
// (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1
// represents the imaginary part
repeated float complex64_data = 7;
// (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1
// represents the imaginary part
repeated double complex128_data = 8;
// VarType: STRING
repeated string data = 9;
// Element types:
// 0 => INT64
// 1 => FP32
// 2 => INT32
// 3 => FP64
// 4 => INT16
// 5 => FP16
// 6 => BF16
// 7 => UINT8
// 8 => INT8
// 9 => BOOL
// 10 => COMPLEX64
// 11 => COMPLEX128
// 20 => STRING
int32 elem_type = 10;
// Shape of the tensor, including batch dimensions.
repeated int32 shape = 11;
// Level of data(LOD), support variable length data, only for fetch tensor
// currently.
repeated int32 lod = 12;
// Correspond to the variable 'name' in the model description prototxt.
string name = 13;
// Correspond to the variable 'alias_name' in the model description prototxt.
string alias_name = 14; // get from the Model prototxt
// VarType: FP16, INT16, INT8, BF16, UINT8
bytes tensor_content = 15;
};
```
- elem_type:数据类型,当前支持 FLOAT32, INT64, INT32, UINT8, INT8, FLOAT16
|elem_type|类型|
|---------|----|
|0|INT64|
|1|FLOAT32|
|2|INT32|
|3|FP64|
|4|INT16|
|5|FP16|
|6|BF16|
|7|UINT8|
|8|INT8|
- shape:数据维度
- lod:lod 信息,LoD(Level-of-Detail) Tensor 是 Paddle 的高级特性,是对 Tensor 的一种扩充,用于支持更自由的数据输入。Lod 相关原理介绍,请参考[相关文档](../LOD_CN.md)
- name/alias_name: 名称及别名,与模型配置对应
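上面的 `lod` 字段可以用一个小例子直观理解(示意):
```python
# 一个 batch 含两条变长序列,长度分别为 3 和 2
# 变长数据拼接后共 5 个元素,lod 记录每条序列在拼接数据中的偏移区间
float_data = [1.0, 1.1, 1.2, 2.0, 2.1]
lod = [0, 3, 5]   # 第 1 条序列对应区间 [0, 3),第 2 条序列对应区间 [3, 5)
```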
**二.构建 Tensor 数据**
1. FLOAT32 类型 Tensor
```C
// 原始数据
std::vector<float> float_data;
Tensor *tensor = new Tensor;
// 设置维度,可以设置多维
for (uint32_t j = 0; j < float_shape.size(); ++j) {
tensor->add_shape(float_shape[j]);
}
// 设置 LOD 信息
for (uint32_t j = 0; j < float_lod.size(); ++j) {
tensor->add_lod(float_lod[j]);
}
// 设置类型、名称及别名
tensor->set_elem_type(1);
tensor->set_name(name);
tensor->set_alias_name(alias_name);
// 拷贝数据
int total_number = float_data.size();
tensor->mutable_float_data()->Resize(total_number, 0);
memcpy(tensor->mutable_float_data()->mutable_data(), float_data.data(), total_number * sizeof(float));
```
2. INT8 类型 Tensor
```C
// 原始数据
std::string string_data;
Tensor *tensor = new Tensor;
for (uint32_t j = 0; j < string_shape.size(); ++j) {
tensor->add_shape(string_shape[j]);
}
for (uint32_t j = 0; j < string_lod.size(); ++j) {
tensor->add_lod(string_lod[j]);
}
tensor->set_elem_type(8);
tensor->set_name(name);
tensor->set_alias_name(alias_name);
tensor->set_tensor_content(string_data);
```
## Request
**一.Request 定义**
Request 为客户端需要发送的请求数据,其以 Tensor 为基础数据单元,并包含了额外的请求信息。定义如下:
```protobuf
message Request {
repeated Tensor tensor = 1;
repeated string fetch_var_names = 2;
bool profile_server = 3;
uint64 log_id = 4;
};
```
- fetch_var_names: 需要获取的输出数据名称,`GeneralResponseOp` 会根据该列表进行过滤,请参考模型文件 serving_client_conf.prototxt 中 `fetch_var` 字段下的 `alias_name`。
- profile_server: 调试参数,打开时会输出性能信息
- log_id: 请求ID
**二.构建 Request**
1. Protobuf 形式
当使用 BRPC 或 GRPC 进行请求时,使用 protobuf 形式数据,构建方式如下:
```C
Request req;
req.set_log_id(log_id);
for (auto &name : fetch_name) {
req.add_fetch_var_names(name);
}
// 添加Tensor
Tensor *tensor = req.add_tensor();
...
```
2. Json 形式
当使用 RESTful 请求时,可以使用 Json 形式数据,具体格式如下:
```Json
{"tensor":[{"float_data":[0.0137,-0.1136,0.2553,-0.0692,0.0582,-0.0727,-0.1583,-0.0584,0.6283,0.4919,0.1856,0.0795,-0.0332],"elem_type":1,"name":"x","alias_name":"x","shape":[1,13]}],"fetch_var_names":["price"],"log_id":0}
```
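作为参考,下面是一段用 Python `requests` 发送上述 Json 数据的示意(其中 `SERVING_HTTP_URL` 为占位的服务地址,需按实际部署修改):
```python
import json
import requests

# 与上文 Json 示例一致的请求体
payload = {
    "tensor": [{
        "float_data": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
                       -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332],
        "elem_type": 1, "name": "x", "alias_name": "x", "shape": [1, 13]
    }],
    "fetch_var_names": ["price"],
    "log_id": 0
}

SERVING_HTTP_URL = "http://127.0.0.1:9393/..."   # 占位:替换为实际部署的 HTTP 预测地址
resp = requests.post(SERVING_HTTP_URL, data=json.dumps(payload))
print(resp.json())   # 返回中包含 outputs、err_no、err_msg 等字段
```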
## Response
**一.Response 定义**
Response 为服务端返回给客户端的结果,包含了 Tensor 数据、错误码、错误信息等。定义如下:
```protobuf
message Response {
repeated ModelOutput outputs = 1;
repeated int64 profile_time = 2;
// Error code
int32 err_no = 3;
// Error messages
string err_msg = 4;
};
message ModelOutput {
repeated Tensor tensor = 1;
string engine_name = 2;
}
```
- profile_time:当设置 request->set_profile_server(true) 时,会返回性能信息
- err_no:错误码,详见`core/predictor/common/constant.h`
- err_msg:错误信息,详见`core/predictor/common/constant.h`
- engine_name:输出节点名称
|err_no|err_msg|
|---------|----|
|0|OK|
|-5000|"Paddle Serving Framework Internal Error."|
|-5001|"Paddle Serving Memory Alloc Error."|
|-5002|"Paddle Serving Array Overflow Error."|
|-5100|"Paddle Serving Op Inference Error."|
**二.读取 Response 数据**
```C
// res 为已接收到的 Response 对象;shape、lod、float_data、string_data 为调用方预先定义的输出变量
Response res;
uint32_t model_num = res.outputs_size();
for (uint32_t m_idx = 0; m_idx < model_num; ++m_idx) {
  // 取出第 m_idx 个模型的输出
  const ModelOutput &output = res.outputs(m_idx);
  std::string engine_name = output.engine_name();
int idx = 0;
// 读取 tensor 维度
int shape_size = output.tensor(idx).shape_size();
for (int i = 0; i < shape_size; ++i) {
shape[i] = output.tensor(idx).shape(i);
}
// 读取 LOD 信息
int lod_size = output.tensor(idx).lod_size();
if (lod_size > 0) {
lod.resize(lod_size);
for (int i = 0; i < lod_size; ++i) {
lod[i] = output.tensor(idx).lod(i);
}
}
// 读取 float 数据
int size = output.tensor(idx).float_data_size();
float_data = std::vector<float>(
output.tensor(idx).float_data().begin(),
output.tensor(idx).float_data().begin() + size);
// 读取 int8 数据
string_data = output.tensor(idx).tensor_content();
}
```
# C++ Serving 通讯协议
- [网络框架](#0)
- [Tensor](#1)
- [1.1 构建 FLOAT32 Tensor](#1.1)
- [1.2 构建 STRING Tensor](#1.2)
- [Request](#2)
- [2.1 构建 Protobuf Request](#2.1)
- [2.2 构建 Json Request](#2.2)
- [Response](#3)
- [3.1 读取 Response 数据](#3.1)
<a name="0"></a>
## 网络框架
C++ Serving 基于 [bRPC](https://github.com/apache/incubator-brpc) 网络框架构建服务,支持 bRPC、gRPC 和 RESTful 协议请求。不限于开发语言和框架,甚至 `curl` 方式,只要按照上述协议封装数据并发送,Server 就能够接收、处理和返回结果。
对于支持的各种协议我们提供了部分的 Client SDK 示例供用户参考和使用,用户也可以根据自己的需求去开发新的 Client SDK,也欢迎用户添加其他语言/协议(例如 GRPC-Go、GRPC-C++、HTTP2-Go、HTTP2-Java 等)Client SDK 到我们的仓库供其他开发者借鉴和参考。
| 通信协议 | 速度 | 是否支持 | 是否提供Client SDK |
|-------------|-----|---------|-------------------|
| bRPC | 最快 | 支持 | [C++]、[Python(Pybind方式)] |
| HTTP 2.0 + Proto | 快 | 不支持 | |
| gRPC | 较快 | 支持 | [Java]、[Python] |
| HTTP 1.1 + Proto | 较快 | 支持 | [Java]、[Python] |
| HTTP 1.1 + Json | 慢 | 支持 | [Java]、[Python]、[Curl] |
C++ Serving 请求和应答的数据格式为 protobuf,重要的结构有以下3个:
<a name="1"></a>
## Tensor
[Tensor](https://github.com/PaddlePaddle/Serving/blob/develop/core/general-server/proto/general_model_service.proto#L22) 可以装载多种类型的数据,是 Request 和 Response 的基础单元。Tensor 的定义如下:
```protobuf
message Tensor {
// VarType: INT64
repeated int64 int64_data = 1;
// VarType: FP32
repeated float float_data = 2;
// VarType: INT32
repeated int32 int_data = 3;
// VarType: FP64
repeated double float64_data = 4;
// VarType: UINT32
repeated uint32 uint32_data = 5;
// VarType: BOOL
repeated bool bool_data = 6;
// (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1
// represents the imaginary part
repeated float complex64_data = 7;
// (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1
// represents the imaginary part
repeated double complex128_data = 8;
// VarType: STRING
repeated string data = 9;
// Element types:
// 0 => INT64
// 1 => FP32
// 2 => INT32
// 3 => FP64
// 4 => INT16
// 5 => FP16
// 6 => BF16
// 7 => UINT8
// 8 => INT8
// 9 => BOOL
// 10 => COMPLEX64
// 11 => COMPLEX128
// 20 => STRING
int32 elem_type = 10;
// Shape of the tensor, including batch dimensions.
repeated int32 shape = 11;
// Level of data(LOD), support variable length data, only for fetch tensor
// currently.
repeated int32 lod = 12;
// Correspond to the variable 'name' in the model description prototxt.
string name = 13;
// Correspond to the variable 'alias_name' in the model description prototxt.
string alias_name = 14; // get from the Model prototxt
// VarType: FP16, INT16, INT8, BF16, UINT8
bytes tensor_content = 15;
};
```
Tensor 结构中重要成员包括 `elem_type`、`shape`、`lod` 和 `name/alias_name`:
- name/alias_name: 名称及别名,与模型配置对应
- elem_type:数据类型,当前支持FLOAT32, INT64, INT32, UINT8, INT8, FLOAT16
- shape:数据维度
- lod:变长结构 LoD(Level-of-Detail) Tensor 是 Paddle 的高级特性,是对 Tensor 的一种扩充,用于支持更自由的数据输入。详见[LOD](../LOD_CN.md)
|elem_type|类型|
|---------|----|
|0|INT64|
|1|FLOAT32|
|2|INT32|
|3|FP64|
|4|INT16|
|5|FP16|
|6|BF16|
|7|UINT8|
|8|INT8|
|9|BOOL|
|10|COMPLEX64|
|11|COMPLEX128|
|20|STRING|
<a name="1.1"></a>
**一.构建 FLOAT32 Tensor**
创建 Tensor 对象,通过 `mutable_float_data()->Resize()` 设置 FLOAT32 类型数据长度,通过 `memcpy` 函数拷贝数据。
```C
// 原始数据
std::vector<float> float_data;
Tensor *tensor = new Tensor;
// 设置维度,可以设置多维
for (uint32_t j = 0; j < float_shape.size(); ++j) {
tensor->add_shape(float_shape[j]);
}
// 设置LOD信息
for (uint32_t j = 0; j < float_lod.size(); ++j) {
tensor->add_lod(float_lod[j]);
}
// 设置类型、名称及别名
tensor->set_elem_type(1);
tensor->set_name(name);
tensor->set_alias_name(alias_name);
// 拷贝数据
int total_number = float_data.size();
tensor->mutable_float_data()->Resize(total_number, 0);
memcpy(tensor->mutable_float_data()->mutable_data(), float_data.data(), total_number * sizeof(float));
```
<a name="1.2"></a>
**二.构建 STRING Tensor**
创建 Tensor 对象,通过 `set_tensor_content` 设置 string 类型数据。
```C
// 原始数据
std::string string_data;
Tensor *tensor = new Tensor;
for (uint32_t j = 0; j < string_shape.size(); ++j) {
tensor->add_shape(string_shape[j]);
}
for (uint32_t j = 0; j < string_lod.size(); ++j) {
tensor->add_lod(string_lod[j]);
}
tensor->set_elem_type(8);
tensor->set_name(name);
tensor->set_alias_name(alias_name);
tensor->set_tensor_content(string_data);
```
<a name="2"></a>
## Request
Request 为客户端需要发送的请求数据,其以 Tensor 为基础数据单元,并包含了额外的请求信息。定义如下:
```protobuf
message Request {
repeated Tensor tensor = 1;
repeated string fetch_var_names = 2;
bool profile_server = 3;
uint64 log_id = 4;
};
```
- fetch_var_names: 需要获取的输出数据名称,`GeneralResponseOp` 会根据该列表进行过滤,请参考模型文件 `serving_client_conf.prototxt` 中 `fetch_var` 字段下的 `alias_name`。
- profile_server: 调试参数,打开时会输出性能信息
- log_id: 请求ID
当使用 bRPC 或 gRPC 进行请求时,使用 protobuf 格式数据;当使用 RESTful 请求时,使用 Json 格式数据。
<a name="2.1"></a>
**一.构建 Protobuf Request**
创建 Request 对象,通过 `add_tensor` 接口来设置 Tensor。
```C
Request req;
req.set_log_id(log_id);
for (auto &name : fetch_name) {
req.add_fetch_var_names(name);
}
// 添加Tensor
Tensor *tensor = req.add_tensor();
...
```
<a name="2.2"></a>
**二.构建 Json Request**
当使用 RESTful 请求时,可以使用 Json 格式数据,示例如下:
```JSON
{"tensor":[{"float_data":[0.0137,-0.1136,0.2553,-0.0692,0.0582,-0.0727,-0.1583,-0.0584,0.6283,0.4919,0.1856,0.0795,-0.0332],"elem_type":1,"name":"x","alias_name":"x","shape":[1,13]}],"fetch_var_names":["price"],"log_id":0}
```
可参考示例,不用修改整体结构,仅需修改数据类型和数据。
<a name="3"></a>
## Response
Response 为服务端返回给客户端的结果,包含了 Tensor 数据、错误码、错误信息等。定义如下:
```protobuf
message Response {
repeated ModelOutput outputs = 1;
repeated int64 profile_time = 2;
// Error code
int32 err_no = 3;
// Error messages
string err_msg = 4;
};
message ModelOutput {
repeated Tensor tensor = 1;
string engine_name = 2;
}
```
Response 结构中核心成员:
- profile_time:当设置 `request->set_profile_server(true)` 时,会返回性能信息
- err_no:错误码
- err_msg:错误信息
- engine_name:输出节点名称
|err_no|err_msg|
|---------|----|
|0|OK|
|-5000|"Paddle Serving Framework Internal Error."|
|-5001|"Paddle Serving Memory Alloc Error."|
|-5002|"Paddle Serving Array Overflow Error."|
|-5100|"Paddle Serving Op Inference Error."|
<a name="3.1"></a>
**一.读取 Response 数据**
读取 Response 对象中 Tensor 数据的示例如下:
```C
Response res;
uint32_t model_num = res.outputs_size();
for (uint32_t m_idx = 0; m_idx < model_num; ++m_idx) {
  // 取出第 m_idx 个模型的输出;shape、lod、float_data、string_data 为调用方预先定义的输出变量
  const ModelOutput &output = res.outputs(m_idx);
  std::string engine_name = output.engine_name();
int idx = 0;
// 读取tensor维度
int shape_size = output.tensor(idx).shape_size();
for (int i = 0; i < shape_size; ++i) {
shape[i] = output.tensor(idx).shape(i);
}
// 读取LOD信息
int lod_size = output.tensor(idx).lod_size();
if (lod_size > 0) {
lod.resize(lod_size);
for (int i = 0; i < lod_size; ++i) {
lod[i] = output.tensor(idx).lod(i);
}
}
// 读取float数据
int size = output.tensor(idx).float_data_size();
float_data = std::vector<float>(
output.tensor(idx).float_data().begin(),
output.tensor(idx).float_data().begin() + size);
// 读取int8数据
string_data = output.tensor(idx).tensor_content();
}
```
# Paddle Serving 中的模型热加载
## 背景
在实际的工业场景下,通常是远端定期不间断产出模型,线上服务端需要在服务不中断的情况下拉取新模型对旧模型进行更新迭代。
## Server Monitor
Paddle Serving 提供了一个自动监控脚本,远端地址更新模型后会拉取新模型更新本地模型,同时更新本地模型文件夹中的时间戳文件(默认为 `fluid_time_file`)实现热加载。
目前支持下面几种类型的远端监控 Monitor:
| Monitor类型 | 描述 | 特殊选项 |
| :---------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
| general | 远端无认证,可以通过 `wget` 直接访问下载文件(如无需认证的FTP,BOS等) | `general_host` 通用远端host |
| hdfs/afs(HadoopMonitor) | 远端为 HDFS 或 AFS,通过 Hadoop-Client 执行相关命令 | `hadoop_bin` Hadoop 二进制的路径 <br/>`fs_name` Hadoop fs_name,默认为空<br/>`fs_ugi` Hadoop fs_ugi,默认为空 |
| ftp | 远端为 FTP,通过 `ftplib` 进行相关访问(`ftplib` 为 Python 标准库,一般无需额外安装) | `ftp_host` FTP host<br>`ftp_port` FTP port<br>`ftp_username` FTP username,默认为空<br>`ftp_password` FTP password,默认为空 |
| Monitor通用选项 | 描述 | 默认值 |
| :--------------------: | :----------------------------------------------------------: | :--------------------: |
| `type` | 指定 Monitor 类型 | 无 |
| `remote_path` | 指定远端的基础路径 | 无 |
| `remote_model_name` | 指定远端需要拉取的模型名 | 无 |
| `remote_donefile_name` | 指定远端标志模型更新完毕的 donefile 文件名 | 无 |
| `local_path` | 指定本地工作路径 | 无 |
| `local_model_name` | 指定本地模型名 | 无 |
| `local_timestamp_file` | 指定本地用于热加载的时间戳文件,该文件被认为在 `local_path/local_model_name` 下。 | `fluid_time_file` |
| `local_tmp_path` | 指定本地存放临时文件的文件夹路径,若不存在则自动创建。 | `_serving_monitor_tmp` |
| `interval` | 指定轮询间隔时间,单位为秒。 | `10` |
| `unpacked_filename` | Monitor 支持 tarfile 打包的远程模型。如果远程模型是打包格式,则需要设置该选项来告知 Monitor 解压后的文件名。 | `None` |
| `debug` | 如果添加 `--debug` 选项,则输出更详细的中间信息。 | 默认不添加该选项 |
下面通过 HadoopMonitor 示例来展示 Paddle Serving 的模型热加载功能。
## HadoopMonitor 示例
示例中在 `product_path` 中生产模型上传至 hdfs,在 `server_path` 中模拟服务端模型热加载:
```shell
.
├── product_path
└── server_path
```
**一.生产模型**
`product_path` 下运行下面的 Python 代码生产模型(运行前需要修改 hadoop 相关的参数),每隔 60 秒会产出 Boston 房价预测模型的打包文件 `uci_housing.tar.gz` 并上传至 hdfs 的`/`路径下,上传完毕后更新时间戳文件 `donefile` 并上传至 hdfs 的`/`路径下。
```python
import os
import sys
import time
import tarfile
import paddle
import paddle.fluid as fluid
import paddle_serving_client.io as serving_io
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=16)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.test(), buf_size=500),
batch_size=16)
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
sgd_optimizer.minimize(avg_loss)
place = fluid.CPUPlace()
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
def push_to_hdfs(local_file_path, remote_path):
afs = 'afs://***.***.***.***:***' # User needs to change
uci = '***,***' # User needs to change
hadoop_bin = '/path/to/haddop/bin' # User needs to change
prefix = '{} fs -Dfs.default.name={} -Dhadoop.job.ugi={}'.format(hadoop_bin, afs, uci)
os.system('{} -rmr {}/{}'.format(
prefix, remote_path, local_file_path))
os.system('{} -put {} {}'.format(
prefix, local_file_path, remote_path))
name = "uci_housing"
for pass_id in range(30):
for data_train in train_reader():
avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data_train),
fetch_list=[avg_loss])
# Simulate the production model every other period of time
time.sleep(60)
model_name = "{}_model".format(name)
client_name = "{}_client".format(name)
serving_io.save_model(model_name, client_name,
{"x": x}, {"price": y_predict},
fluid.default_main_program())
# Packing model
tar_name = "{}.tar.gz".format(name)
tar = tarfile.open(tar_name, 'w:gz')
tar.add(model_name)
tar.close()
# Push packaged model file to hdfs
push_to_hdfs(tar_name, '/')
# Generate donefile
donefile_name = 'donefile'
os.system('touch {}'.format(donefile_name))
# Push donefile to hdfs
push_to_hdfs(donefile_name, '/')
```
hdfs 上的文件如下列所示:
```bash
# hadoop fs -ls /
Found 2 items
-rw-r--r-- 1 root supergroup 0 2020-04-02 02:54 /donefile
-rw-r--r-- 1 root supergroup 2101 2020-04-02 02:54 /uci_housing.tar.gz
```
**二.服务端加载模型**
进入 `server_path` 文件夹。
1. 用初始模型启动 Server 端
这里使用预训练的 Boston 房价预测模型作为初始模型:
```shell
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
```
启动 Server 端:
```shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292
```
2. 执行监控程序
用下面的命令来执行 HDFS 监控程序:
```shell
python -m paddle_serving_server.monitor \
--type='hdfs' --hadoop_bin='/hadoop-3.1.2/bin/hadoop' \
--remote_path='/' --remote_model_name='uci_housing.tar.gz' \
--remote_donefile_name='donefile' --local_path='.' \
--local_model_name='uci_housing_model' --local_timestamp_file='fluid_time_file' \
--local_tmp_path='_tmp' --unpacked_filename='uci_housing_model' --debug
```
上面代码通过轮询方式监控远程 HDFS 地址`/`的时间戳文件`/donefile`,当时间戳变更则认为远程模型已经更新,将远程打包模型`/uci_housing.tar.gz`拉取到本地临时路径`./_tmp/uci_housing.tar.gz`下,解包出模型文件`./_tmp/uci_housing_model`后,更新本地模型`./uci_housing_model`以及Paddle Serving的时间戳文件`./uci_housing_model/fluid_time_file`
预计输出如下:
```shell
2020-04-02 10:12 INFO [monitor.py:85] _hadoop_bin: /hadoop-3.1.2/bin/hadoop
2020-04-02 10:12 INFO [monitor.py:85] _fs_name:
2020-04-02 10:12 INFO [monitor.py:85] _fs_ugi:
2020-04-02 10:12 INFO [monitor.py:209] AFS prefix cmd: /hadoop-3.1.2/bin/hadoop fs
2020-04-02 10:12 INFO [monitor.py:85] _remote_path: /
2020-04-02 10:12 INFO [monitor.py:85] _remote_model_name: uci_housing.tar.gz
2020-04-02 10:12 INFO [monitor.py:85] _remote_donefile_name: donefile
2020-04-02 10:12 INFO [monitor.py:85] _local_model_name: uci_housing_model
2020-04-02 10:12 INFO [monitor.py:85] _local_path: .
2020-04-02 10:12 INFO [monitor.py:85] _local_timestamp_file: fluid_time_file
2020-04-02 10:12 INFO [monitor.py:85] _local_tmp_path: _tmp
2020-04-02 10:12 INFO [monitor.py:85] _interval: 10
2020-04-02 10:12 DEBUG [monitor.py:214] check cmd: /hadoop-3.1.2/bin/hadoop fs -ls /donefile 2>/dev/null
2020-04-02 10:12 DEBUG [monitor.py:216] resp: -rw-r--r-- 1 root supergroup 0 2020-04-02 10:11 /donefile
2020-04-02 10:12 INFO [monitor.py:138] doneilfe(donefile) changed.
2020-04-02 10:12 DEBUG [monitor.py:233] pull cmd: /hadoop-3.1.2/bin/hadoop fs -get /uci_housing.tar.gz _tmp/uci_housing.tar.gz 2>/dev/null
2020-04-02 10:12 INFO [monitor.py:144] pull remote model(uci_housing.tar.gz).
2020-04-02 10:12 INFO [monitor.py:98] unpack remote file(uci_housing.tar.gz).
2020-04-02 10:12 DEBUG [monitor.py:108] remove packed file(uci_housing.tar.gz).
2020-04-02 10:12 INFO [monitor.py:110] using unpacked filename: uci_housing_model.
2020-04-02 10:12 DEBUG [monitor.py:175] update model cmd: cp -r _tmp/uci_housing_model/* ./uci_housing_model
2020-04-02 10:12 INFO [monitor.py:152] update local model(uci_housing_model).
2020-04-02 10:12 DEBUG [monitor.py:184] update timestamp cmd: touch ./uci_housing_model/fluid_time_file
2020-04-02 10:12 INFO [monitor.py:157] update model timestamp(fluid_time_file).
2020-04-02 10:12 INFO [monitor.py:161] sleep 10s.
2020-04-02 10:12 DEBUG [monitor.py:214] check cmd: /hadoop-3.1.2/bin/hadoop fs -ls /donefile 2>/dev/null
2020-04-02 10:12 DEBUG [monitor.py:216] resp: -rw-r--r-- 1 root supergroup 0 2020-04-02 10:11 /donefile
2020-04-02 10:12 INFO [monitor.py:161] sleep 10s.
```
3. 查看 Server 日志
通过下面命令查看 Server 的运行日志:
```shell
tail -f log/serving.INFO
```
日志中显示模型已经被热加载:
```shell
I0330 09:38:40.087316 7361 server.cpp:150] Begin reload framework...
W0330 09:38:40.087399 7361 infer.h:656] Succ reload version engine: 18446744073709551615
I0330 09:38:40.087414 7361 manager.h:131] Finish reload 1 workflow(s)
I0330 09:38:50.087535 7361 server.cpp:150] Begin reload framework...
W0330 09:38:50.087641 7361 infer.h:250] begin reload model[uci_housing_model].
I0330 09:38:50.087972 7361 infer.h:66] InferEngineCreationParams: model_path = uci_housing_model, enable_memory_optimization = 0, static_optimization = 0, force_update_static_cache = 0
I0330 09:38:50.088027 7361 analysis_predictor.cc:88] Profiler is deactivated, and no profiling report will be generated.
I0330 09:38:50.088393 7361 analysis_predictor.cc:841] MODEL VERSION: 1.7.1
I0330 09:38:50.088413 7361 analysis_predictor.cc:843] PREDICTOR VERSION: 1.6.3
I0330 09:38:50.089519 7361 graph_pattern_detector.cc:96] --- detected 1 subgraphs
I0330 09:38:50.090925 7361 analysis_predictor.cc:470] ======= optimize end =======
W0330 09:38:50.090986 7361 infer.h:472] Succ load common model[0x7fc83c06abd0], path[uci_housing_model].
I0330 09:38:50.091022 7361 analysis_predictor.cc:88] Profiler is deactivated, and no profiling report will be generated.
W0330 09:38:50.091050 7361 infer.h:509] td_core[0x7fc83c0ad770] clone model from pd_core[0x7fc83c06abd0] succ, cur_idx[0].
...
W0330 09:38:50.091784 7361 infer.h:489] Succ load clone model, path[uci_housing_model]
W0330 09:38:50.091794 7361 infer.h:656] Succ reload version engine: 18446744073709551615
I0330 09:38:50.091820 7361 manager.h:131] Finish reload 1 workflow(s)
I0330 09:39:00.091987 7361 server.cpp:150] Begin reload framework...
W0330 09:39:00.092161 7361 infer.h:656] Succ reload version engine: 18446744073709551615
I0330 09:39:00.092177 7361 manager.h:131] Finish reload 1 workflow(s)
```
# C++ Serving ABTest
- [功能设计](#1)
- [使用案例](#2)
- [2.1 安装 Paddle Serving Wheels](#2.1)
- [2.2 下载多个模型并保存模型参数](#2.2)
- [2.3 启动 A,B,C 3个服务](#2.3)
- [2.4 客户端注册 A,B,C 服务端地址](#2.4)
- [2.5 启动客户端并验证结果](#2.5)
ABTest 是一种功能测试方案,一般是为同一个产品目标制定多种方案,让一部分用户使用 A 方案,另一部分用户使用 B 或 C 方案,根据测试效果,如点击率、转化率等来评价方案的优劣。
模型服务化部署框架中,ABTest 属于一个重要的基础功能,为模型迭代升级提供实验环境。Paddle Serving 在 Python SDK 中实现了 ABTest 功能,为用户提供简单易用的功能测试环境。
<a name="1"></a>
## 功能设计
Paddle Serving 的 ABTest 功能由 Python SDK 和多个服务端构成。每个服务端加载不同模型,客户端上注册多个服务端地址和访问比例,按比例确定每次请求访问的服务端。
<div align=center>
<img src='images/6-5_Cpp_ABTest_CN_1.png' height = "400" align="middle"/>
</div>
<a name="2"></a>
## 使用案例
以 [imdb](https://github.com/PaddlePaddle/Serving/tree/develop/examples/C%2B%2B/imdb) 示例为例,介绍 ABTest 的使用,部署有5个步骤:
1. 安装 Paddle Serving Wheels
2. 下载多个模型并保存模型参数
3. 启动 A,B,C 3个服务
4. 客户端注册 A,B,C 服务端地址
5. 启动客户端并验证结果
<a name="2.1"></a>
**一.安装 Paddle Serving Wheels**
使用 ABTest 功能的前提是使用 PYTHON SDK,因此需要安装 `paddle_serving_client` 的 wheel 包。[安装方法](./2-1_Docker_Images_CN.md) 如下:
```
pip3 install paddle-serving-client==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
```
<a name="2.2"></a>
**二.下载多个模型并保存模型参数**
本示例已提供了一键下载脚本 `sh get_data.sh`,可下载用 `bow`、`cnn`、`lstm` 3种不同方式训练的模型。
```
sh get_data.sh
```
3种模型的所有文件如下所示,已为用户提前保存模型参数,无需执行保存操作。
```
├── imdb_bow_client_conf
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
├── imdb_bow_model
│   ├── embedding_0.w_0
│   ├── fc_0.b_0
│   ├── fc_0.w_0
│   ├── fc_1.b_0
│   ├── fc_1.w_0
│   ├── fc_2.b_0
│   ├── fc_2.w_0
│   ├── fluid_time_file
│   ├── __model__
│   ├── serving_server_conf.prototxt
│   └── serving_server_conf.stream.prototxt
├── imdb_cnn_client_conf
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
├── imdb_cnn_model
│   ├── embedding_0.w_0
│   ├── fc_0.b_0
│   ├── fc_0.w_0
│   ├── fc_1.b_0
│   ├── fc_1.w_0
│   ├── fluid_time_file
│   ├── __model__
│   ├── sequence_conv_0.b_0
│   ├── sequence_conv_0.w_0
│   ├── serving_server_conf.prototxt
│   └── serving_server_conf.stream.prototxt
├── imdb_lstm_client_conf
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
├── imdb_lstm_model
│   ├── embedding_0.w_0
│   ├── fc_0.b_0
│   ├── fc_0.w_0
│   ├── fc_1.b_0
│   ├── fc_1.w_0
│   ├── fc_2.b_0
│   ├── fc_2.w_0
│   ├── lstm_0.b_0
│   ├── lstm_0.w_0
│   ├── __model__
│   ├── serving_server_conf.prototxt
│   └── serving_server_conf.stream.prototxt
```
虽然3个模型的网络结构不同,但是 `feed_var` 和 `fetch_var` 都是相同的,便于做 ABTest。
```
feed_var {
name: "words"
alias_name: "words"
is_lod_tensor: true
feed_type: 0
shape: -1
}
fetch_var {
name: "fc_2.tmp_2"
alias_name: "prediction"
is_lod_tensor: false
fetch_type: 1
shape: 2
}
```
<a name="2.3"></a>
**三.启动 A,B,C 3个服务**
后台启动 `bow``cnn``lstm` 模型服务:
```python
## 启动 bow 模型服务
python3 -m paddle_serving_server.serve --model imdb_bow_model/ --port 9297 >/dev/null 2>&1 &
## 启动 cnn 模型服务
python3 -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9298 >/dev/null 2>&1 &
## 启动 lstm 模型服务
python3 -m paddle_serving_server.serve --model imdb_lstm_model/ --port 9299 >/dev/null 2>&1 &
```
<a name="2.4"></a>
**四.客户端注册 A,B,C 服务端地址**
使用 `paddle_serving_client` 中 `Client::add_variant(self, tag, cluster, variant_weight)` 接口注册服务标签、服务地址和权重。框架会将所有权重求和后计算每个服务的比例。本示例中,bow 服务的权重是10,cnn 服务的权重是30,lstm 服务的权重是60,每次请求分别命中3个服务的比例是10%、30%和60%。
```
from paddle_serving_client import Client
from paddle_serving_app.reader.imdb_reader import IMDBDataset
import sys
import numpy as np
client = Client()
client.load_client_config(sys.argv[1])
client.add_variant("bow", ["127.0.0.1:9297"], 10)
client.add_variant("cnn", ["127.0.0.1:9298"], 30)
client.add_variant("lstm", ["127.0.0.1:9299"], 60)
client.connect()
```
如要在结果中打印请求到了哪个服务,在 `client.predict(feed, fetch, batch, need_variant_tag, logid)` 中设置 `need_variant_tag=True`
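下面是一段调用示意(接上面的代码;假设 `feed_dict` 已按 imdb 示例完成预处理,具体返回结构以所用版本为准):
```python
# 示意:开启 need_variant_tag 后,返回结果中会携带命中的服务 tag
result = client.predict(feed=feed_dict,
                        fetch=["prediction"],
                        need_variant_tag=True)
print(result)
```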
<a name="2.5"></a>
**五.启动客户端并验证结果**
运行命令:
```
head test_data/part-0 | python3.7 abtest_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab
```
运行结果如下,10次请求中,bow 服务2次,cnn 服务3次,lstm 服务5次,与设置的比例基本相近。
```
I0506 04:02:46.720135 44567 naming_service_thread.cpp:202] brpc::policy::ListNamingService("127.0.0.1:9297"): added 1
I0506 04:02:46.722630 44567 naming_service_thread.cpp:202] brpc::policy::ListNamingService("127.0.0.1:9298"): added 1
I0506 04:02:46.723577 44567 naming_service_thread.cpp:202] brpc::policy::ListNamingService("127.0.0.1:9299"): added 1
I0506 04:02:46.814075 44567 general_model.cpp:490] [client]logid=0,client_cost=9.889ms,server_cost=6.283ms.
server_tag=lstm prediction=[0.500398 0.49960205]
I0506 04:02:46.826339 44567 general_model.cpp:490] [client]logid=0,client_cost=10.261ms,server_cost=9.503ms.
server_tag=lstm prediction=[0.5007235 0.49927652]
I0506 04:02:46.828992 44567 general_model.cpp:490] [client]logid=0,client_cost=1.667ms,server_cost=0.741ms.
server_tag=bow prediction=[0.25859657 0.74140346]
I0506 04:02:46.843299 44567 general_model.cpp:490] [client]logid=0,client_cost=13.402ms,server_cost=12.827ms.
server_tag=lstm prediction=[0.50039905 0.4996009 ]
I0506 04:02:46.850219 44567 general_model.cpp:490] [client]logid=0,client_cost=5.129ms,server_cost=4.332ms.
server_tag=cnn prediction=[0.6369219 0.36307803]
I0506 04:02:46.854203 44567 general_model.cpp:490] [client]logid=0,client_cost=2.804ms,server_cost=0.782ms.
server_tag=bow prediction=[0.15088597 0.849114 ]
I0506 04:02:46.858268 44567 general_model.cpp:490] [client]logid=0,client_cost=3.292ms,server_cost=2.677ms.
server_tag=cnn prediction=[0.4608788 0.5391212]
I0506 04:02:46.869217 44567 general_model.cpp:490] [client]logid=0,client_cost=10.13ms,server_cost=9.556ms.
server_tag=lstm prediction=[0.5000269 0.49997318]
I0506 04:02:46.883790 44567 general_model.cpp:490] [client]logid=0,client_cost=13.312ms,server_cost=12.822ms.
server_tag=lstm prediction=[0.50083774 0.49916226]
I0506 04:02:46.887256 44567 general_model.cpp:490] [client]logid=0,client_cost=2.432ms,server_cost=1.812ms.
server_tag=cnn prediction=[0.47895813 0.52104187]
```
......@@ -8,5 +8,6 @@ Python Pipeline 使用案例请阅读[Python Pipeline 快速部署案例](./3-2_
通过阅读以下内容掌握 Python Pipeline 核心功能和使用方法、高阶功能用法和性能优化指南等。
- [Python Pipeline 框架设计](7-1_Python_Pipeline_Design_CN.md)
- [Python Pipeline 核心功能](7-2_Python_Pipeline_Senior_CN.md)
- [Python Pipeline 优化指南](7-3_Python_Pipeline_Optimize_CN.md)
- [Python Pipeline 性能指标](7-4_Python_Pipeline_Benchmark_CN.md)
# Python Pipeline 框架设计
- [目标](#1)
- [框架设计](#2)
- [2.1 网络层设计](#2.1)
- [2.2 图执行引擎层](#2.2)
- [2.3 服务日志](#2.3)
- [2.4 错误信息](#2.4)
- [自定义信息](#3)
- [3.1 自定义 Web 服务 URL](#3.1)
- [3.2 自定义服务输入和输出结构](#3.2)
- [3.3 自定义服务并发和模型配置](#3.3)
- [3.4 自定义推理过程](#3.4)
- [3.5 自定义业务错误类型](#3.5)
<a name="1"></a>
## 目标
为了解决多个深度学习模型组合的复杂问题,Paddle Serving 团队设计了一个通用端到端多模型组合框架,其核心特点包括:
1. 通用性:框架既要满足通用模型的输入类型,又要满足模型组合的复杂拓扑关系。
......@@ -7,6 +24,7 @@
3. 高可用性:高可用的架构依赖每个服务的健壮性,服务状态可查询、异常可监控和管理是必备条件。
4. 易于开发与调试:使用 Python 语言开发可大幅提升研发效率,运行的错误信息准确帮助开发者快速定位问题。
<a name="2"></a>
## 框架设计
Python Pipeline 框架分为网络服务层和图执行引擎2部分,网络服务层处理多种网络协议请求和通用输入参数问题,图执行引擎层解决复杂拓扑关系。如下图所示
......@@ -15,6 +33,8 @@ Python Pipeline 框架分为网络服务层和图执行引擎2部分,网络服
<img src='../images/pipeline_serving-image1.png' height = "250" align="middle"/>
</div>
<a name="2.1"></a>
**一.网络服务层**
网络服务层包括了 gRPC-gateway 和 gRPC Server。gRPC gateway 接收 HTTP 请求,打包成 proto 格式后转发给 gRPC Server,一套处理程序可同时处理 HTTP、gRPC 2种类型请求。
......@@ -58,6 +78,18 @@ ocr_service.prepare_pipeline_config("config.yml")
ocr_service.run_service()
```
与网络框架相关的配置在 `config.yml` 中设置。其中 `worker_num` 表示框架主线程 gRPC 线程池工作线程数,可理解成网络同步线程并发数。
其次,`rpc_port` 和 `http_port` 是服务端口,可同时开启,不允许同时为空。
```
worker_num: 10
# http 和 gRPC 服务端口
rpc_port: 9988
http_port: 18089
```
<a name="2.2"></a>
**二.图执行引擎层**
......@@ -126,15 +158,37 @@ Channel的设计原则:
<img src='../images/pipeline_serving-image3.png' height = "500" align="middle"/>
</div>
<a name="2.3"></a>
**三.服务日志**
Pipeline 服务日志在当前目录的 `PipelineServingLogs` 目录下,有3种类型日志,分别是 `pipeline.log`、`pipeline.log.wf` 和 `pipeline.tracer`。
- `pipeline.log` : 记录 debug & info日志信息
- `pipeline.log.wf` : 记录 warning & error日志
- `pipeline.tracer` : 统计各个阶段耗时、channel 堆积信息
```
├── config.yml
├── get_data.sh
├── PipelineServingLogs
│   ├── pipeline.log
│   ├── pipeline.log.wf
│   └── pipeline.tracer
├── README_CN.md
├── README.md
├── uci_housing_client
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
├── uci_housing_model
│   ├── fc_0.b_0
│   ├── fc_0.w_0
│   ├── __model__
│   ├── serving_server_conf.prototxt
│   └── serving_server_conf.stream.prototxt
├── web_service_java.py
└── web_service.py
```
在服务发生异常时,错误信息会记录在 pipeline.log.wf 日志中。打印 tracer 日志要求在 config.yml 的 DAG 属性中添加 tracer 配置。
1. 日志与请求的唯一标识
......@@ -177,9 +231,46 @@ Pipeline 的日志模块在 `logger.py` 中定义,使用了 `logging.handlers.
```
<a name="2.4"></a>
**四. 错误信息**
框架提供的错误信息如下所示,完整信息在 `error_catch.py` 的 `CustomExceptionCode` 类中定义。
| 错误码 | 说明 |
| :---: | :-------------: |
| 0 | 成功 |
| 50 ~ 999 | 产品错误 |
| 3000 ~ 3999 | 框架内部服务错误 |
| 4000 ~ 4999 | 配置错误 |
| 5000 ~ 5999 | 用户输入错误 |
| 6000 ~ 6999 | 超时错误 |
| 7000 ~ 7999 | 类型检查错误 |
| 8000 ~ 8999 | 内部通讯错误 |
| 9000 ~ 9999 | 推理错误 |
| 10000 ~ | 其他错误 |
具体错误信息如下:
```
class CustomExceptionCode(enum.Enum):
OK = 0
PRODUCT_ERROR = 50
NOT_IMPLEMENTED = 3000
CLOSED_ERROR = 3001
NO_SERVICE = 3002
INIT_ERROR = 3003
CONF_ERROR = 4000
INPUT_PARAMS_ERROR = 5000
TIMEOUT = 6000
TYPE_ERROR = 7000
RPC_PACKAGE_ERROR = 8000
CLIENT_ERROR = 9000
UNKNOW = 10000
```
<a name="3"></a>
## 自定义信息
......@@ -190,8 +281,9 @@ Pipeline 的日志模块在 `logger.py` 中定义,使用了 `logging.handlers.
- 自定义推理过程
- 自定义业务错误类型
<a name="3.1"></a>
**一.自定义 Web 服务 URL**
在 Web 服务中自定义服务名称是常见操作,尤其是将已有服务迁移到新框架。URL 中核心字段包括 `ip`、`port`、`name` 和 `method`,根据最新部署的环境信息设置前2个字段,这里重点介绍如何设置 `name` 和 `method`。框架提供默认的 `method` 是 `prediction`,如 `http://127.0.0.1:9999/ocr/prediction`。
......@@ -230,8 +322,9 @@ service PipelineService {
}
};
```
<a name="3.2"></a>
**二.自定义服务输入和输出结构**
输入和输出结构包括 proto 中 Request 和 Response 结构,以及 Op 前后处理返回。
......@@ -241,10 +334,15 @@ service PipelineService {
修改后,需要[重新编译]()
<a name="3.3"></a>
**三.自定义服务并发和模型配置**
完整的配置信息可参考[配置信息]()
<a name="3.4"></a>
**四.自定义推理过程**
推理 Op 为开发者提供3个外部函数接口:
......@@ -369,8 +467,9 @@ class ResponseOp(Op):
return resp
```
<a name="3.5"></a>
**五.自定义业务错误类型**
用户可根据业务场景自定义错误码,继承 ProductErrCode,在 Op 的 preprocess 或 postprocess 的返回列表中返回自定义错误码,下一阶段处理会根据自定义错误码跳过后置 OP 处理。
```python
......
# Python Pipeline 核心功能
从设计上,Python Pipeline 框架实现轻量级的服务化部署,提供了丰富的核心功能,既能满足服务基本使用,又能满足特性需求。
- [安装与环境检查](#1)
- [服务启动与关闭](#2)
- [本地与远程推理](#3)
- [批量推理](#4)
- [4.1 客户端打包批量数据](#4.1)
- [4.2 服务端合并多个请求动态合并批量](#4.2)
- [4.3 Mini-Batch](#4.3)
- [单机多卡推理](#5)
- [多种计算芯片上推理](#6)
- [TensorRT 推理加速](#7)
- [MKLDNN 推理加速](#8)
- [低精度推理](#9)
- [9.1 CPU 低精度推理](#9.1)
- [9.2 GPU 和 TensorRT 低精度推理](#9.2)
- [9.3 性能测试](#9.3)
- [复杂图结构 DAG 跳过某个 Op 运行](#10)
<a name="1"></a>
## 安装与环境检查
在运行 Python Pipeline 服务前,确保当前环境下可部署且通过[安装指南](./2-0_Index_CN.md)已完成安装。其次,`v0.8.0`及以上版本提供了环境检查功能,检验环境是否安装正确。
输入以下命令,进入环境检查程序。
```python
python3 -m paddle_serving_server.serve check
```
在环境检验程序中输入多条指令来检查,例如 `check_pipeline`、`check_all` 等,完整指令列表如下。
| 指令 | 描述|
|---------|----|
|check_all | 检查 Paddle Inference、Pipeline Serving、C++ Serving。只打印检测结果,不记录日志|
|check_pipeline | 检查 Pipeline Serving,只打印检测结果,不记录日志|
|check_cpp | 检查 C++ Serving,只打印检测结果,不记录日志|
|check_inference | 检查 Paddle Inference 是否安装正确,只打印检测结果,不记录日志|
|debug | 发生报错后,该命令将打印提示日志到屏幕,并记录详细日志文件|
|exit | 退出|
程序会分别运行 cpu 和 gpu 示例。运行成功则打印 `Pipeline cpu environment running success` 和 `Pipeline gpu environment running success`。
```
/usr/local/lib/python3.7/runpy.py:125: RuntimeWarning: 'paddle_serving_server.serve' found in sys.modules after import of package 'paddle_serving_server', but prior to execution of 'paddle_serving_server.serve'; this may result in unpredictable behaviour
warn(RuntimeWarning(msg))
Welcome to the check env shell.Type help to list commands.
(Cmd) check_pipeline
Pipeline cpu environment running success
Pipeline gpu environment running success
```
运行失败时,错误信息会记录到当前目录下 `stderr.log` 文件 和 `Pipeline_test_cpu/PipelineServingLogs` 目录下。用户可根据错误信息调试。
```
(Cmd) check_all
PaddlePaddle inference environment running success
C++ cpu environment running success
C++ gpu environment running failure, if you need this environment, please refer to https://github.com/PaddlePaddle/Serving/blob/develop/doc/Install_CN.md
Traceback (most recent call last):
File "/usr/local/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/local/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.7/site-packages/paddle_serving_server/serve.py", line 541, in <module>
Check_Env_Shell().cmdloop()
File "/usr/local/lib/python3.7/cmd.py", line 138, in cmdloop
stop = self.onecmd(line)
File "/usr/local/lib/python3.7/cmd.py", line 217, in onecmd
return func(arg)
File "/usr/local/lib/python3.7/site-packages/paddle_serving_server/serve.py", line 501, in do_check_all
check_env("all")
File "/usr/local/lib/python3.7/site-packages/paddle_serving_server/env_check/run.py", line 94, in check_env
run_test_cases(pipeline_test_cases, "Pipeline", is_open_std)
File "/usr/local/lib/python3.7/site-packages/paddle_serving_server/env_check/run.py", line 66, in run_test_cases
mv_log_to_new_dir(new_dir_path)
File "/usr/local/lib/python3.7/site-packages/paddle_serving_server/env_check/run.py", line 48, in mv_log_to_new_dir
shutil.move(file_path, dir_path)
File "/usr/local/lib/python3.7/shutil.py", line 555, in move
raise Error("Destination path '%s' already exists" % real_dst)
shutil.Error: Destination path '/home/work/Pipeline_test_cpu/PipelineServingLogs' already exists
```
<a name="2"></a>
## 服务启动与关闭
服务启动需要三类文件,PYTHON 程序、模型文件和配置文件。以[Python Pipeline 快速部署案例](./3-2_QuickStart_Pipeline_OCR_CN.md)为例,
```
.
├── config.yml
├── imgs
│   └── ggg.png
├── ocr_det_client
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
├── ocr_det_model
│   ├── inference.pdiparams
│   ├── inference.pdmodel
│   ├── serving_server_conf.prototxt
│   └── serving_server_conf.stream.prototxt
├── ocr_det.tar.gz
├── ocr_rec_client
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
├── ocr_rec_model
│   ├── inference.pdiparams
│   ├── inference.pdmodel
│   ├── serving_server_conf.prototxt
│   └── serving_server_conf.stream.prototxt
├── pipeline_http_client.py
├── pipeline_rpc_client.py
├── ppocr_keys_v1.txt
└── web_service.py
```
启动服务端程序运行 `web_service.py`,启动客户端程序运行 `pipeline_http_client.py` 或 `pipeline_rpc_client.py`。服务端启动的日志信息在 `PipelineServingLogs` 目录下,可用于调试。
```
├── PipelineServingLogs
│   ├── pipeline.log
│   ├── pipeline.log.wf
│   └── pipeline.tracer
```
关闭程序可使用2种方式,
- 前台关闭程序:`Ctrl+C` 关停服务
- 后台关闭程序:
```
python3 -m paddle_serving_server.serve stop # 触发 SIGINT 信号
python3 -m paddle_serving_server.serve kill # 触发 SIGKILL 信号,强制关闭
```
<a name="3"></a>
## 本地与远程推理
本地推理是指在服务所在机器环境下开启多进程推理,而远程推理是指本地服务请求远程 C++ Serving 推理服务。
本地推理的优势是实现简单,一般本地处理相比于远程推理耗时更低。而远程推理的优势是可实现 Python Pipeline 较难实现的功能,如部署加密模型,大模型推理。
Python Pipeline 的本地推理可参考如下配置,在 `uci` op 中 增加 `local_service_conf` 配置,并设置 `client_type: local_predictor`
```
op:
uci:
#并发数,is_thread_op=True时,为线程并发;否则为进程并发
concurrency: 10
#当op配置没有server_endpoints时,从local_service_conf读取本地服务配置
local_service_conf:
#uci模型路径
model_config: uci_housing_model
#计算硬件类型: 空缺时由devices决定(CPU/GPU),0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 0
#计算硬件ID,优先由device_type决定硬件类型。devices为""或空缺时为CPU预测;当为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
devices: "" # "0,1"
#client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测
client_type: local_predictor
#Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["price"]
```
Python Pipeline 的远程推理可参考如下配置,设置 `client_type: brpc`、`server_endpoints`、`timeout` 和本地 `client_config`。
```
op:
bow:
#并发数,is_thread_op=True时,为线程并发;否则为进程并发
concurrency: 1
#client连接类型,brpc
client_type: brpc
#Serving交互重试次数,默认不重试
retry: 1
#Serving交互超时时间, 单位ms
timeout: 3000
#Serving IPs
server_endpoints: ["127.0.0.1:9393"]
#bow模型client端配置
client_config: "imdb_bow_client_conf/serving_client_conf.prototxt"
#Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["prediction"]
```
<a name="4"></a>
## 批量推理
Pipeline 支持批量推理,通过增大 batch size 可以提高 GPU 利用率。Python Pipeline 支持3种 batch 形式以及适用的场景如下:
- 场景1:客户端打包批量数据(Client Batch)
- 场景2:服务端合并多个请求动态合并批量(Server auto-batching)
- 场景3:拆分一个大批量的推理请求为多个小批量推理请求(Server mini-batch)
<a name="4.1"></a>
**一.客户端打包批量数据**
当输入数据是 numpy 类型,如shape 为[4, 3, 512, 512]的 numpy 数据,即4张图片,可直接作为输入数据。
当输入数据的 shape 不同时,需要按最大的 shape 尺寸 Padding 对齐后再发送给服务端,可参考下面的示意代码。
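下面是一段用 numpy 做补 0 对齐并打包 batch 的示意(假设输入为 CHW 排布的图像数组):
```python
import numpy as np

def pad_and_stack(imgs):
    """将不同分辨率的图片按最大尺寸补 0 对齐,打包成一个 batch。"""
    max_h = max(img.shape[1] for img in imgs)
    max_w = max(img.shape[2] for img in imgs)
    padded = []
    for img in imgs:
        c, h, w = img.shape
        buf = np.zeros((c, max_h, max_w), dtype=img.dtype)
        buf[:, :h, :w] = img
        padded.append(buf)
    return np.stack(padded)          # shape: [batch, C, max_h, max_w]

batch = pad_and_stack([np.ones((3, 960, 960), np.float32),
                       np.ones((3, 640, 480), np.float32)])
print(batch.shape)                   # (2, 3, 960, 960)
```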
<a name="4.2"></a>
**二.服务端合并多个请求动态合并批量**
有助于提升吞吐和计算资源的利用率,当多个请求的 shape 尺寸不相同时,不支持合并。当前有2种合并策略,分别是:
- 等待时间与最大批量结合(推荐):`batch_size` 与 `auto_batching_timeout` 配合使用,实际请求的批量条数超过 `batch_size` 时会立即执行,不超过时会等待 `auto_batching_timeout` 时间再执行
......@@ -119,9 +266,11 @@ op:
```
<a name="4.3"></a>
**三.Mini-Batch**
拆分一个批量数据推理请求成为多个小块推理:会降低批量数据 Padding 对齐的大小,从而提升速度。可参考 [OCR 示例](),核心思路是拆分数据成多个小批量,放入 list 对象 feed_list 并返回
```
def preprocess(self, input_dicts, data_id, log_id):
......@@ -181,8 +330,9 @@ def preprocess(self, input_dicts, data_id, log_id):
return feed_list, False, None, ""
```
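上面的代码在此处被折叠显示,其核心思路可用下面的 Python 片段来理解(feed 变量名 `x` 与 mini-batch 大小均为假设值):
```python
import numpy as np

def split_mini_batch(imgs, mini_batch_size=2):
    """示意:把一个大 batch 的输入拆成多个小 batch,放入 feed_list 返回。"""
    feed_list = []
    for i in range(0, imgs.shape[0], mini_batch_size):
        feed_list.append({"x": imgs[i:i + mini_batch_size]})
    return feed_list

feed_list = split_mini_batch(np.ones((5, 3, 224, 224), np.float32))
print(len(feed_list))   # 3 个小批量:2 + 2 + 1
```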
<a name="5"></a>
## 单机多卡推理
单机多卡推理与 `config.yml` 中的4个参数关系紧密:`is_thread_op`、`concurrency`、`device_type` 和 `devices`。必须使用进程模型和 GPU 模式,每张卡上可分配多个进程,即 M 个 Op 进程与 N 个 GPU 卡绑定。
```
......@@ -218,8 +368,9 @@ op:
对于更灵活的进程与 GPU 卡绑定方式,会持续开发。
<a name="6"></a>
## 多种计算芯片上推理
除了支持 CPU、GPU 芯片推理之外,Python Pipeline 还支持在多种计算硬件上推理。根据 `config.yml` 中的 `device_type` 和 `devices` 来设置推理硬件和加速库,如下:
- CPU(Intel) : 0
......@@ -232,27 +383,99 @@ op:
当不设置`device_type`时,根据 `devices` 来设置,即当 `device_type` 为 "" 或空缺时为 CPU 推理;当有设定如"0,1,2"时,为 GPU 推理,并指定 GPU 卡。
以使用 XPU 的编号为0卡为例,配合 `ir_optim` 一同开启,`config.yml`详细配置如下:
```
# 计算硬件类型
device_type: 4
# 计算硬件ID,优先由device_type决定硬件类型
devices: "0,1"
devices: "0"
# 开启ir优化
ir_optim: True
```
<a name="7"></a>
## TensorRT 推理加速
TensorRT 是一个高性能的深度学习推理优化器,是在 NVIDIA GPU 硬件平台上运行的推理框架,为深度学习应用提供低延迟、高吞吐率的部署推理。
通过设置 `device_type`、`devices` 和 `ir_optim` 字段即可实现 TensorRT 高性能推理。必须同时设置 `ir_optim: True` 才能开启 TensorRT。
```
op:
imagenet:
#并发数,is_thread_op=True时,为线程并发;否则为进程并发
concurrency: 1
#当op配置没有server_endpoints时,从local_service_conf读取本地服务配置
local_service_conf:
#uci模型路径
model_config: serving_server/
#计算硬件类型: 空缺时由devices决定(CPU/GPU),0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 2
#计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
devices: "1" # "0,1"
#client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测
client_type: local_predictor
#Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["score"]
#开启 ir_optim
ir_optim: True
```
<a name="8"></a>
## MKL-DNN 推理加速
MKL-DNN 是针对 Intel CPU 和 GPU 的数学核心库,对深度学习网络进行算子和指令集的性能优化,从而提升执行速度。Paddle 框架已集成了 MKL-DNN。
目前仅支持 Intel CPU 推理加速,通过设置 `device_type`、`devices` 和 `use_mkldnn` 字段使用 MKL-DNN。
```
op:
imagenet:
#并发数,is_thread_op=True时,为线程并发;否则为进程并发
concurrency: 1
#当op配置没有server_endpoints时,从local_service_conf读取本地服务配置
local_service_conf:
#uci模型路径
model_config: serving_server/
#计算硬件类型: 空缺时由devices决定(CPU/GPU),0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 0
#计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
devices: ""
#client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测
client_type: local_predictor
#Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["score"]
#开启 MKLDNN
use_mkldnn: True
```
<a name="9"></a>
## 低精度推理
Pipeline Serving 支持低精度推理,CPU、GPU 和 TensorRT 支持的精度类型如下图所示:
低精度推理需要有量化模型,配合`config.yml`配置一起使用,以[低精度示例]() 为例
<a name="9.1"></a>
**一.CPU 低精度推理**
通过设置 `device_type` 和 `devices` 字段使用 CPU 推理,通过调整 `precision`、`thread_num` 和 `use_mkldnn` 参数选择低精度和性能调优。
......@@ -290,9 +513,11 @@ op:
use_mkldnn: True
```
<a name="9.2"></a>
**二.GPU 和 TensorRT 低精度推理**
通过设置 `device_type` 和 `devices` 字段使用原生 GPU 或 TensorRT 推理,通过调整 `precision`、`ir_optim` 和 `use_calib` 参数选择低精度和性能调优;如开启 TensorRT,必须一同开启 `ir_optim`,`use_calib` 仅配合 int8 使用。
```
op:
imagenet:
......@@ -327,8 +552,9 @@ op:
ir_optim: True
```
<a name="9.3"></a>
**三.性能测试**
测试环境如下:
- GPU 型号: A100-40GB
......@@ -345,7 +571,6 @@ op:
- GPU + int8 + ir_optim + TensorRT + use_calib : 15.1 ms
- GPU + fp16 + ir_optim + TensorRT : 17.2 ms
CPU 推理性能较好的配置是
- CPU + bf16 + MKLDNN : 18.2 ms
- CPU + fp32 + thread_num=10 : 18.4 ms
......@@ -354,3 +579,50 @@ CPU 推理性能较好的配置是
<div align=center>
<img src='../images/low_precision_profile.png' height = "600" align="middle"/>
</div>
<a name="10"></a>
## 复杂图结构 DAG 跳过某个 Op 运行
此应用场景一般在 Op 前后处理中有 if 条件判断时,不满足条件时,跳过后面处理。实际做法是在跳过此 Op 的 process 阶段,只要在 preprocess 做好判断,跳过 process 阶段,在和 postprocess 后直接返回即可。
preprocess 返回结果列表的第二个结果是 `is_skip_process=True` 表示是否跳过当前 Op 的 process 阶段,直接进入 postprocess 处理。
```python
## Op::preprocess() 函数实现
def preprocess(self, input_dicts, data_id, log_id):
"""
In preprocess stage, assembling data for process stage. users can
override this function for model feed features.
Args:
input_dicts: input data to be preprocessed
data_id: inner unique id
log_id: global unique id for RTT
Return:
input_dict: data for process stage
is_skip_process: skip process stage or not, False default
prod_errcode: None default, otherwise, product errores occured.
It is handled in the same way as exception.
prod_errinfo: "" default
"""
# multiple previous Op
if len(input_dicts) != 1:
_LOGGER.critical(
self._log(
"Failed to run preprocess: this Op has multiple previous "
"inputs. Please override this func."))
os._exit(-1)
(_, input_dict), = input_dicts.items()
return input_dict, False, None, ""
```
以下示例 Jump::preprocess() 重载了原函数,当输入数据中带有 `jump` 字段时,第二个返回值为 True,表示跳过 process 阶段。
```python
class JumpOp(Op):
    ## Overload func JumpOp::preprocess
    def preprocess(self, input_dicts, data_id, log_id):
        (_, input_dict), = input_dicts.items()
        # Python3 中使用 in 判断 key 是否存在
        if "jump" in input_dict:
            return input_dict, True, None, ""
        else:
            return input_dict, False, None, ""
```
# Python Pipeline 优化指南
- [优化响应时长](#1)
- [1.1 分析响应时长](#1.1)
- [Pipeline Trace Tool](#1.1.1)
- [Pipeline Profile Tool](#1.1.2)
- [1.2 优化思路](#1.2)
- [优化服务吞吐](#2)
- [2.1 分析吞吐瓶颈](#2.1)
- [2.2 优化思路](#2.2)
- [增加 Op 并发](#2.2.1)
- [动态批量](#2.2.2)
- [CPU 与 GPU 处理分离](#2.2.3)
通常,服务的性能优化是基于耗时分析,首先要掌握服务运行的各阶段耗时信息,从中找到耗时最长的性能瓶颈再做针对性优化。对于模型推理服务化不仅要关注耗时,由于 GPU 芯片昂贵,更要关注服务吞吐,从而提升 GPU 利用率实现降本增效。因此,模型推理服务化可总结为:
- 优化响应时长
- 优化服务吞吐
经过分析和调优后,各个阶段实现整体服务的性能最优。
<a name="1"></a>
## 优化响应时长
优化响应时长的主要思路是:首先掌握各阶段耗时,分析出性能瓶颈或耗时占比较高的阶段,再针对瓶颈做专项优化。
Paddle Serving 提供2种耗时分析工具:`Pipeline Trace Tool` 和 `Pipeline Profile Tool`,2个工具的特点如下:
- Pipeline Trace Tool : 统计服务端所有进程各个阶段的平均耗时,包括每个 `Op``Channel`,用于定量分析。
- Pipeline Profile Tool : 是可视化 Trace View 工具,生成多进程并发效果图,用定性和定量分析执行和并发效果。
<a name="1.1"></a>
**一.耗时分析**
<a name="1.1.1"></a>
1.Pipeline Trace Tool
`Pipeline Trace Tool` 统计每个 `Op` 与 `Channel` 中各阶段的处理耗时。
开启方法是在配置文件 `config.yml` 的 `dag` 区段内添加 `tracer` 字段,框架会每隔 `interval_s` 时间生成 Trace 信息。
```
dag:
#op资源类型, True, 为线程模型;False,为进程模型
is_thread_op: True
#tracer, 跟踪框架吞吐,每个OP和channel的工作情况。无tracer时不生成数据
tracer:
#每次trace的时间间隔,单位秒/s
interval_s: 10
```
生成的 Trace 信息保存在 `./PipelineServingLogs/pipeline.tracer` 日志中,示例如下:
```
==================== TRACER ======================
Op(uci):
in[8473.507333333333 ms]: # 等待前置 Channel 中数据放入 Op 的耗时,如长时间无请求,此值会变大
prep[0.6753333333333333 ms] # 推理前处理 preprocess 阶段耗时
midp[26.476333333333333 ms] # 推理 process 阶段耗时
postp[1.8616666666666666 ms] # 推理后处理 postprocess 阶段耗时
out[1.3236666666666668 ms] # 后处理结果放入后置 channel 耗时
idle[0.9965882097324374] # 框架自循环耗时,间隔 1 ms,如此值很大说明系统负载高,调度变慢
DAGExecutor:
Query count[30] # interval_s 间隔时间内请求数量
QPS[27.35 q/s] # interval_s 间隔时间内服务 QPS
Succ[1.0] # interval_s 间隔时间内请求成功率
Error req[] # 异常请求信息
Latency:
ave[36.55233333333334 ms] # 平均延时
.50[8.702 ms] # 50分位延时
.60[8.702 ms] # 60分位延时
.70[92.346 ms] # 70分位延时
.80[92.346 ms] # 80分位延时
.90[92.346 ms] # 90分位延时
.95[92.346 ms] # 95分位延时
.99[92.346 ms] # 99分位延时
Channel (server worker num[1]):
chl0(In: ['@DAGExecutor'], Out: ['uci']) size[0/0] # 框架 RequestOp 与 uci Op 之间 Channel 中堆积请求数。此值较大,说明下游 uci Op 消费能力不足。
chl1(In: ['uci'], Out: ['@DAGExecutor']) size[0/0] # uci Op 与框架 ResponseOp 之间 Channel 中堆积的请求数。此值较大,说明下游 ResponseOp 消费能力不足。
==================== TRACER ======================
```
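如需对多个时间窗口的打点数据做汇总,可以用脚本解析 `pipeline.tracer` 日志。下面是一个仅依据上文日志格式编写的最小示意脚本(日志路径与字段名以实际输出为准):
```python
# 汇总 pipeline.tracer 中各阶段平均耗时的示意脚本(假设日志格式与上文示例一致)
import re
from collections import defaultdict

# 匹配形如 "prep[0.675 ms]"、"midp[26.476 ms]" 的打点信息
STAGE_PATTERN = re.compile(r"\b(in|prep|midp|postp|out)\[([\d.]+) ms\]")

def summarize_tracer(path="./PipelineServingLogs/pipeline.tracer"):
    stage_costs = defaultdict(list)
    with open(path) as f:
        for line in f:
            for stage, cost in STAGE_PATTERN.findall(line):
                stage_costs[stage].append(float(cost))
    # 输出各阶段在所有时间窗口内的平均耗时,便于定位瓶颈(如 midp 偏高则优先优化推理阶段)
    for stage, costs in stage_costs.items():
        print("{}: avg {:.3f} ms over {} windows".format(
            stage, sum(costs) / len(costs), len(costs)))

if __name__ == "__main__":
    summarize_tracer()
```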
<a name="1.1.2"></a>
2.Pipeline Profile Tool
使用时先将 Server 的输出保存到文件,以 `profile.txt` 为例,脚本将日志中的时间打点信息转换成 json 格式保存到 `trace` 文件,`trace` 文件可以通过 chrome 浏览器的 tracing 功能进行可视化。
```
dag:
    #op资源类型, True, 为线程模型;False,为进程模型
    is_thread_op: True
    #使用性能分析,默认为 False。开启后会记录 Timeline 性能数据,对性能有一定影响
    use_profile: True
```
开启后,Server 端在预测过程中会将对应的日志信息打印到`标准输出`。为了更直观地展现各阶段的耗时,启动服务时需使用如下命令将输出重定向到文件:
```
python3.7 web_service.py > profile.txt 2>&1
```
服务接收请求后,Profile 信息会输出到 `profile.txt` 文件中。再将如下代码粘贴到 `trace.py`,使用框架提供的 Analyst 模块对日志文件做进一步的分析处理。
```
from paddle_serving_server.pipeline import Analyst
import json
import sys
......@@ -30,50 +111,104 @@ if __name__ == "__main__":
analyst.save_trace(trace_filename)
```
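上面的 `trace.py` 片段在此处被截断,下面给出一个基于 Analyst 模块的完整最小示意(文件名为假设,Analyst 的构造参数以实际安装版本为准):
```python
# trace.py 最小示意:将 profile.txt 中的打点信息转换为 chrome tracing 可读的 trace 文件
from paddle_serving_server.pipeline import Analyst
import sys

if __name__ == "__main__":
    # 默认读取当前目录下的 profile.txt,生成 trace 文件
    log_filename = sys.argv[1] if len(sys.argv) > 1 else "./profile.txt"
    trace_filename = sys.argv[2] if len(sys.argv) > 2 else "./trace"
    analyst = Analyst(log_filename)
    analyst.save_trace(trace_filename)
```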
运行命令,脚本将日志中的时间打点信息转换成 json 格式保存到 `trace` 文件。
```
python3.7 trace.py
```
`trace` 文件可以通过 `chrome` 浏览器的 `tracing` 功能进行可视化。
```
打开 chrome 浏览器,在地址栏输入 chrome://tracing/ ,跳转至 tracing 页面,点击 load 按钮,打开保存的 trace 文件,即可将预测服务的各阶段时间信息可视化。
```
通过图示中并发请求的处理流程可观测到推理阶段的流水线状态,以及多个请求在推理阶段的`间隔`信息,进行优化。
<a name="1.2"></a>

**二.降低响应时长优化思路**
根据 `Pipeline Trace Tool` 的输出结果定位耗时较长的阶段,常见场景的优化方法如下:
- Op 推理阶段(midp) 耗时长:
- 增加 Op 并发度
- 开启 auto-batching (前提是多个请求的 shape 一致)
- 若批量数据中某条数据的 shape 很大,padding 很大导致推理很慢,可参考 OCR 示例中 mini-batch 方法。
- 开启 TensorRT/MKL-DNN 优化
- 开启低精度推理
- Op 前处理阶段(prep) 或 后处理阶段耗时长:
- 增加 OP 并发度
- 优化前后处理逻辑
- in/out 耗时长(channel 堆积>5)
- 检查 channel 传递的数据大小,可能为传输的数据大导致延迟大。
- 优化传入数据,不传递数据或压缩后再传入
- 增加 Op 并发度
- 减少上游 Op 并发度
根据 `Pipeline Profile Tool` 输出结果优化流水线并发的效果:
- 增加 Op 并发度,或调整不同 Op 的并发度
- 开启 auto-batching
此外,还有一些优化思路,如将 CPU 处理较慢的过程迁移到 GPU 上处理;客户端与服务端传输较大数据时,可使用共享内存方式传递内存或显存地址等。
<a name="2"></a>
## 优化服务吞吐
<a name="2.1"></a>
**一.分析吞吐瓶颈**
服务的吞吐量受到多种因素制约,如 Op 处理时长、传输数据耗时、并发数和 DAG 图结构等。可以将这些因素进一步拆解:当传输数据量不是极端庞大时,最重要的因素是流水线中`最慢 Op 的处理时长和并发数`。
```
Op 处理时长:
    op_cost = process(pre + mid + post)

Op 期望并发数:
    op_concurrency = op_cost(s) * 期望 QPS

服务吞吐量:
    service_throughput = 1 / 最慢 op_cost * 并发数

服务平响:
    service_avg_cost = ∑op_cost【关键路径】

Channel 堆积:
    channel_acc_size = QPS(down - up) * time

批量预测平均耗时:
    avg_batch_cost = (N * pre + mid + post) / N
```
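下面用一组假设的耗时数值演示上述公式的用法(数值仅为示意):
```python
# 按上述公式估算并发数与吞吐的示意计算(各阶段耗时为假设值,单位:秒)
pre, mid, post = 0.002, 0.020, 0.003
op_cost = pre + mid + post               # 单 Op 处理时长:0.025 s

expected_qps = 200
op_concurrency = op_cost * expected_qps  # 期望并发数:0.025 * 200 = 5

# 假设该 Op 即为流水线中最慢的 Op,并发数取 5
service_throughput = 1 / op_cost * 5     # 1 / 0.025 * 5 = 200 QPS

print("op_concurrency =", op_concurrency)
print("service_throughput =", service_throughput, "QPS")
```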
<a name="2.2"></a>
**二.优化思路**
优化吞吐的主要方法是`增大 Op 并发数`、`自动批量`和`CPU 与 GPU 处理分离`。
<a name="2.2.1"></a>
1.增加 Op 并发
调整 Op 并发数量的方法是:设置 `is_thread_op: False` 使用进程类型 Op,并修改 `uci` Op 的 `concurrency` 字段:
```
dag:
    #op资源类型, True, 为线程模型;False,为进程模型
    is_thread_op: False

op:
    uci:
        #并发数,is_thread_op=True时,为线程并发;否则为进程并发
        concurrency: 10
```
Op 的进程数量不是越大越好,受到机器 CPU 核数、内存和显存大小的限制,推荐设置 Op 的并发数不超过系统 CPU 核数。
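例如,可以参考系统 CPU 核数为并发数设置上限(示意写法):
```python
# 以系统 CPU 核数作为 Op 并发数上限的示意写法
import multiprocessing

desired_concurrency = 10                 # 期望的并发数(假设值)
cpu_cores = multiprocessing.cpu_count()
concurrency = min(desired_concurrency, cpu_cores)
print("recommended concurrency:", concurrency)
```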
<a name="2.2.2"></a>
2.动态批量
动态批量是增加吞吐的又一种方法,开启方式可参考[Python Pipeline 核心功能](./7-2_Python_Pipeline_Senior_CN.md#批量推理)
<a name="2.2.3"></a>
3.CPU 与 GPU 处理分离
`CV` 模型中,对图片或视频的前后处理成为主要瓶颈时,可考虑此方案,即将前后处理过程独立成一个 Op 并独立设置并发度。
将 CPU 前后处理和 GPU 推理过程比例调整到服务最佳配比。以 OCR 为例,原有流水线设计为 `RequestOp -> DetOp -> RecOp -> ResponseOp`
根据耗时分析,`DetOp` 和 `RecOp` 的前处理耗时很长,因此将 2 个模型的前处理分离成独立 Op,新的流水线设计为:
`RequestOp -> PreDetOp -> DetOp -> PreRecOp -> RecOp -> ResponseOp`,并调大 `PreDetOp` 和 `PreRecOp` 的并发度,从而获得 20% 的性能提升。
由于增加了2次数据传递,单条请求的处理延时会增加。
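按该思路拆分 DAG 的组网示意如下(`PreDetOp`、`DetOp` 等类名与实现均为假设,导入路径以实际安装版本为准;各 Op 的并发度仍在 config.yml 中按 Op 名称分别设置):
```python
# 将 CPU 前处理拆分为独立 Op 的组网示意(类名为假设,需按业务实现各 Op 的前后处理)
from paddle_serving_server.pipeline import PipelineServer, Op, RequestOp, ResponseOp

class PreDetOp(Op):
    """仅做 CPU 前处理、无模型的 Op:preprocess 返回 is_skip_process=True 跳过推理阶段"""
    def preprocess(self, input_dicts, data_id, log_id):
        (_, input_dict), = input_dicts.items()
        # 在此完成图片解码、缩放等耗时较长的 CPU 前处理
        return input_dict, True, None, ""

class DetOp(Op):
    """GPU 推理 Op:直接使用上游 Op 准备好的数据"""
    def preprocess(self, input_dicts, data_id, log_id):
        (_, input_dict), = input_dicts.items()
        return input_dict, False, None, ""

# PreRecOp / RecOp 的写法与上面类似,此处省略
read_op = RequestOp()
pre_det_op = PreDetOp(name="pre_det", input_ops=[read_op])   # CPU 前处理,可调大并发
det_op = DetOp(name="det", input_ops=[pre_det_op])           # GPU 推理
response_op = ResponseOp(input_ops=[det_op])

server = PipelineServer()
server.set_response_op(response_op)
# server.prepare_server("config.yml")  # 并发度在 config.yml 的 concurrency 字段中设置
# server.run_server()
```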
# Python Pipeline 性能测试
- [测试环境](#1)
- [性能指标与结论](#2)
<a name="1"></a>
## 测试环境
测试环境如下表所示:
| | GPU | 显存 | CPU | 内存 |
|----------|---------|----------|----------------------------------------------|------|
| Serving端 | 4x Tesla P4-8GB | 7611MiB | Intel(R) Xeon(R) Gold 5117 CPU @ 2.00GHz 48核 | 216G |
| Client端 | 4x Tesla P4-8GB | 7611MiB | Intel(R) Xeon(R) Gold 5117 CPU @ 2.00GHz 48核 | 216G |
使用单卡GPU,未开启TensorRT。
模型:ResNet_v2_50
<a name="2"></a>
## 性能指标与结论
通过测试,使用 Python Pipeline 模式通过多进程并发,充分利用 GPU 显卡,具有较好的吞吐性能。
测试数据如下:
|model_name |thread_num |batch_size |CPU_util(%) |GPU_memory(mb) |GPU_util(%) |qps(samples/s) |total count |mean(ms) |median(ms) |80 percent(ms) |90 percent(ms) |99 percent(ms) |total cost(s) |each cost(s)|
|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--
|ResNet_v2_50 |1 |1 |2.2 |3327 |17.25 |17.633658869240787 |355 |56.428481238996476 |38.646728515625 |39.496826171875 |39.98369140625 |1273.1911083984373 |20.131953477859497 |20.033540725708008|
|ResNet_v2_50 |1 |4 |2.7 |3617 |28.122 |53.50748430453522 |268 |74.71539215543378 |74.6181640625 |75.3138671875 |75.6051025390625 |77.85322998046874 |20.03458046913147 |20.024930953979492|
|ResNet_v2_50 |1 |8 |1.7 |3877 |25.7869 |59.60582783086999 |150 |133.5897119140625 |132.7503662109375 |134.968310546875 |136.470703125 |140.79039062499996 |20.132259607315063 |20.03933620452881|
|ResNet_v2_50 |1 |16 |7.0 |4777 |27.0175 |63.2627646819339 |80 |252.30162048339844 |251.8448486328125 |253.046630859375 |253.91142578125 |263.361640625 |20.233070850372314 |20.18476152420044|
|ResNet_v2_50 |1 |32 |7.5 |6567 |38.532 |62.945314687348024 |40 |506.8969482421875 |507.3531494140625 |510.562353515625 |511.421240234375 |536.8068920898437 |20.335111618041992 |20.276386737823486|
|ResNet_v2_50 |2 |1 |4.7 |6567 |49.4828 |50.40600094376044 |1010 |39.63352195815285 |39.5345458984375 |40.452880859375 |41.1375 |42.940522460937494 |20.037296772003174 |20.01696753501892|
|ResNet_v2_50 |2 |4 |2.7 |6567 |44.4744 |83.4255836891382 |420 |95.38548002697172 |95.7069091796875 |97.599951171875 |98.098583984375 |102.39680908203125 |20.137707471847534 |20.03199553489685|
|ResNet_v2_50 |2 |8 |2.2 |6567 |42.898 |91.3727510505176 |230 |174.89108568274457 |175.0452880859375 |175.82001953125 |176.7634033203125 |178.64064453125002 |20.13729453086853 |20.1132071018219|
|ResNet_v2_50 |2 |16 |2.2 |6567 |45 |97.5591285698611 |124 |327.16720088835683 |328.6126708984375 |329.75185546875 |330.386962890625 |336.86397460937496 |20.336385011672974 |20.284939169883728|
|ResNet_v2_50 |2 |32 |3.2 |6567 |59.5714 |100.70765418116333 |64 |633.9812698364258 |637.8568115234375 |648.103515625 |650.7439697265625 |659.2212915039062 |20.336090803146362 |20.28787398338318|
|ResNet_v2_50 |4 |1 |3.1 |6567 |64.3333 |80.27845081929433 |1617 |49.56464230756223 |49.4873046875 |51.5537109375 |52.693408203125 |55.207568359374996 |20.142391681671143 |20.038144528865814|
|ResNet_v2_50 |4 |4 |3.3 |6567 |70.4563 |136.62061939701394 |688 |116.51574919944586 |121.8629150390625 |129.8181640625 |133.384423828125 |142.69500732421875 |20.143372297286987 |20.041599333286285|
|ResNet_v2_50 |4 |8 |3.0 |6567 |70.896 |158.46554975132275 |399 |201.30669079926378 |210.69775390625 |228.51748046875 |236.427294921875 |252.24822753906233 |20.143179416656494 |20.081032752990723|
|ResNet_v2_50 |4 |16 |3.2 |6567 |66.3832 |156.4935247130092 |197 |407.6668608224937 |423.974609375 |450.368212890625 |464.45986328125 |482.93658203125 |20.141408443450928 |20.078101694583893|
|ResNet_v2_50 |4 |32 |3.3 |6567 |72.4791 |162.01742190796557 |104 |785.5079204852765 |813.0341796875 |887.107958984375 |909.6556640625 |935.3334838867188 |20.541000843048096 |20.423666059970856|
|ResNet_v2_50 |8 |1 |3.5 |6567 |93.977 |115.9749228558386 |2337 |68.5580409078145 |65.45849609375 |76.13930664062501 |83.542041015625 |91.45666015624998 |20.15090799331665 |20.028797417879105|
|ResNet_v2_50 |8 |4 |4.2 |6567 |90.0952 |175.58748591910316 |889 |180.7330482920592 |170.5810546875 |218.99931640625 |240.06337890625002 |254.413759765625 |20.252012729644775 |20.084695398807526|
|ResNet_v2_50 |8 |8 |2.6 |6567 |93.8693 |206.76595246418208 |526 |306.52158695119414 |303.043212890625 |321.0791015625 |350.5477294921875 |400.32452392578125 |20.351513147354126 |20.15437400341034|
|ResNet_v2_50 |8 |16 |3.2 |6567 |85.7273 |205.31850043117367 |265 |614.1745522553066 |552.372314453125 |775.89169921875 |802.022607421875 |902.2763183593761 |20.650842428207397 |20.345011442899704|
|ResNet_v2_50 |8 |32 |5.0 |6567 |89.8717 |219.8410273718835 |146 |1138.4533474020761 |1039.640869140625 |1364.289794921875 |1474.6744384765625 |1788.2614379882834 |21.251720190048218 |20.777225106954575|
|ResNet_v2_50 |12 |1 |5.0 |6567 |89.4762 |110.00858327847862 |2218 |108.50048552943953 |103.015625 |121.09404296875003 |137.1392333984375 |151.80401123046872 |20.162063121795654 |20.055511037508648|
|ResNet_v2_50 |12 |4 |4.1 |6567 |77.7619 |153.7824464757549 |779 |309.68895575507463 |285.585205078125 |378.07421875 |413.481640625 |424.70853515625 |20.262390613555908 |20.104551911354065|
|ResNet_v2_50 |12 |8 |3.6 |6567 |72.6977 |165.36021780846013 |425 |571.1991590073529 |510.995849609375 |731.9383300781251 |747.6568359375 |757.304716796875 |20.56117272377014 |20.230452219645183|
|ResNet_v2_50 |12 |16 |1.5 |6567 |76.2222 |189.6414991568285 |252 |987.7153136238219 |926.00390625 |1080.99130859375 |1249.4956298828126 |1434.4802392578124 |21.26116919517517 |20.74245794614156|
|ResNet_v2_50 |12 |32 |2.8 |6567 |84.25 |203.868228281784 |138 |1811.640237559443 |1764.2760009765625 |1855.28046875 |2023.56826171875 |2586.8038134765625 |21.66105055809021 |20.834286351998646|
|ResNet_v2_50 |16 |1 |4.8 |6567 |94.3333 |116.34927733312234 |2347 |136.7957122373642 |135.959716796875 |144.1568359375 |146.105517578125 |175.05707519531248 |20.172020435333252 |20.067057371139526|
|ResNet_v2_50 |16 |4 |15.4 |6567 |83.6364 |160.59012047270738 |822 |393.3079394412447 |396.446533203125 |426.272216796875 |429.777734375 |564.1119360351562 |20.47448492050171 |20.206754431128502|
|ResNet_v2_50 |16 |8 |6.8 |6567 |81.0233 |169.95774070621547 |437 |741.5512622684854 |751.521484375 |763.199169921875 |948.8041992187501 |1001.156142578125 |20.56981921195984 |20.254074171185493|
|ResNet_v2_50 |16 |16 |3.5 |6567 |77.8706 |186.56600081516 |248 |1332.1007946383568 |1365.2745361328125 |1399.212255859375 |1432.4037353515625 |1771.4374853515626 |21.26861262321472 |20.64799252152443|
|ResNet_v2_50 |16 |32 |4.3 |6567 |83.6371 |201.1293408638195 |140 |2419.3400198800223 |2561.09228515625 |2616.081103515625 |2642.0835205078124 |2883.8197412109366 |22.274224042892456 |21.169659316539764|
# 稀疏参数索引服务 Cube
在稀疏参数索引场景,如推荐、广告系统中通常会使用大规模 Embedding 表。在工业级的场景中,稀疏参数的规模非常大,达到 10^9 数量级,在一台机器上启动大规模稀疏参数预测是不实际的,因此我们引入百度多年来在稀疏参数索引领域的工业级产品 Cube,提供分布式的稀疏参数服务。
## Cube 工作原理
本章节介绍了 Cube 的基本使用方法和工作原理。请参考[Cube 架构]()
## Cube 编译安装
本章节介绍了 Cube 各个组件的编译以及安装方法。请参考[Cube 编译安装]()
## Cube 基础功能
本章节介绍了 Cube 的基础功能及使用方法。请参考[Cube 基础功能]()
## Cube 进阶功能
本章节介绍了 Cube 的高级功能使用方法。请参考[Cube 进阶功能]()
## 在 K8S 上使用 Cube
本章节介绍了在 K8S 平台上使用 Cube 的方法。请参考[在 K8S 上使用 Cube]()
## Cube 部署示例
本章节介绍了 Cube 的一个部署示例。请参考[Cube 部署示例]()
# 稀疏参数索引服务 Cube
在稀疏参数索引场景,如推荐、广告系统中通常会使用大规模 Embedding 表。在工业级的场景中,稀疏参数的规模非常大,达到 10^9 数量级,在一台机器上启动大规模稀疏参数预测是不实际的,因此我们引入百度多年来在稀疏参数索引领域的工业级产品 Cube,用于部署大规模的稀疏参数模型,支持模型的分布式管理和快速更新,并且支持 Paddle Serving 进行低延迟的批量访问。
<img src="images/8-1_Cube_Architecture_CN_1.png">
## Cube 组件介绍
**一. cube-builder**
cube-builder 是把模型生成分片文件和版本管理的工具。由于稀疏参数文件往往是一个大文件,需要使用哈希函数将其分割为不同的分片,并使用分布式当中的每一个节点去加载不同的分片。与此同时,工业级的场景需要支持定期模型的配送和流式训练,因此对于模型的版本管理十分重要,这也是在训练保存模型时缺失的部分,因此 cube-builder 在生成分片的同时,也可以人为指定增加版本信息。
**二. cube-transfer**
cube-transfer 是调度管理服务。一方面 cube-transfer 会监测上游模型,当模型更新时进行模型下载。另一方面,会调用 cube-builder 将下载好的模型进行分片。而后与 cube-agent 进行对接完成分片文件配送。
**三. cube-agent**
cube-agent 是与cube-transfer 配套使用的调度管理服务。cube-agent 会接收来自 cube-transfer 传输来的分片文件。而后发送信号给 cube-server 对应接口完成配送操作。
**四. cube-server**
cube-server 基于 Cube 的 KV 能力,对外提供稀疏参数服务。它通过 brpc 提供高性能分布式查询服务,并支持 RestAPI 来进行远端调用。
**五. cube-cli**
cube-cli 是 cube-server 的客户端,用于请求 cube-server 进行对应稀疏参数查询功能。这部分组件已经被整合到 paddle serving 当中,当我们准备好 cube.conf 配置文件并在 server 的代码中指定kv_infer 相关的 op 时,cube-cli 就会在 server 端准备就绪。
## 配送过程
一次完整的配送流程如下:
- 将训练好的模型存放到 FileServer 中,并在传输完成后生成完成标志,这里 FileServer 可以使用 http 协议的文件传输服务;
- cube-transfer 监测到完成标志后,从 FileServer 中下载对应模型文件;
- cube-transfer 使用 cube-builder 工具对稀疏参数模型进行分片;
- cube-transfer 向 cube-agent 进行文件配送;
- cube-agent 向 cube-server 发送加载命令,通知 cube-server 热加载新的参数文件;
- cube-server 响应 Paddle Serving 发送的查询请求。
# Cube 编译
## 编译依赖
**以下是主要组件及其编译依赖**
| 组件 | 说明 |
| :--------------------------: | :-------------------------------: |
| Cube-Server | C++程序,提供高效快速的 RPC 协议 |
| Cube-Agent | Go 程序,需要 Go 环境支持 |
| Cube-Transfer | Go 程序,需要 Go 环境支持 |
| Cube-Builder | C++程序 |
| Cube-Cli | C++组件,已集成进 C++ server 中,不需单独编译 |
## 编译方法
推荐使用 Docker 编译,我们已经为您准备好了编译环境并配置好了上述编译依赖,详见[镜像环境]()。
**一. 设置 PYTHON 环境变量**
请按照如下,确定好需要编译的 Python 版本,设置对应的环境变量,一共需要设置三个环境变量,分别是 `PYTHON_INCLUDE_DIR`, `PYTHON_LIBRARIES`, `PYTHON_EXECUTABLE`。以下我们以 python 3.7为例,介绍如何设置这三个环境变量。
```
# 请自行修改至自身路径
export PYTHON_INCLUDE_DIR=/usr/local/include/python3.7m/
export PYTHON_LIBRARIES=/usr/local/lib/x86_64-linux-gnu/libpython3.7m.so
export PYTHON_EXECUTABLE=/usr/local/bin/python3.7
export GOPATH=$HOME/go
export PATH=$PATH:$GOPATH/bin
python3.7 -m pip install -r python/requirements.txt
go env -w GO111MODULE=on
go env -w GOPROXY=https://goproxy.cn,direct
go install github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
go install github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
go install github.com/golang/protobuf/protoc-gen-go@v1.4.3
go install google.golang.org/grpc@v1.33.0
go env -w GO111MODULE=auto
```
环境变量的含义如下表所示。
| cmake 环境变量 | 含义 | 注意事项 | Docker 环境是否需要 |
|-----------------------|-------------------------------------|-------------------------------|--------------------|
| PYTHON_INCLUDE_DIR | Python.h 所在的目录,通常为 **/include/python3.7/Python.h | 如果没有找到。说明 1)没有安装开发版本的 Python,需重新安装 2)权限不足无法查看相关系统目录。 | 是(/usr/local/include/python3.7) |
| PYTHON_LIBRARIES | libpython3.7.so 或 libpython3.7m.so 所在目录,通常为 /usr/local/lib | 如果没有找到。说明 1)没有安装开发版本的 Python,需重新安装 2)权限不足无法查看相关系统目录。 | 是(/usr/local/lib/x86_64-linux-gnu/libpython3.7m.so) |
| PYTHON_EXECUTABLE | python3.7 所在目录,通常为 /usr/local/bin | | 是(/usr/local/bin/python3.7) |
**二. 编译**
```
mkdir build_cube
cd build_cube
cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \
-DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-DSERVER=ON \
-DWITH_GPU=OFF ..
make -j
cd ..
```
最终我们会在`build_cube/core/cube`目录下看到 Cube 组件已经编译完成,其中:
- Cube-Server:build_cube/core/cube/cube-server/cube
- Cube-Agent:build_cube/core/cube/cube-agent/src/cube-agent
- Cube-Transfer:build_cube/core/cube/cube-transfer/src/cube-transfer
- Cube-Builder:build_cube/core/cube/cube-builder/cube-builder
......@@ -3,6 +3,5 @@ Kubernetes 集群部署
服务部署经历了物理机、虚拟机、容器化、云原生 4 个阶段。云原生提供集装箱组合模式的乐高生态,Docker、Kubernetes 已成为云原生时代的基础设施,推动应用程序大发展。Kubernetes 的可扩展性和分布式架构一直是人工智能和机器学习的绝佳选择,随着解决方案不断成熟,推动机器学习大规模工程落地。
本章节介绍 Kubernetes 上集群化部署 Paddle Serving 方案以及企业级安全网关部署案例。
- [Kubernetes 集群部署方案](./9-1_Kubernetes_CN.md)
- [Kubernetes 安全网关部署案例]()
# Paddle Serving - 端到端服务化推理框架
## 1.Paddle Serving 介绍
面向模型服务化部署场景的端到端服务化推理框架 Paddle Serving,可以实现飞桨模型在 X86、ARM 平台多种硬件上高性能服务化部署,支持5种以上的 GPU、NPU 硬件推理加速;此外,Paddle Serving 提供 Docker 和 Kubernetes 的云端部署方案。
## 2.快速上手-代码模块
进入到 Serving 的 git 目录下,进入到 [fit_a_line](https://github.com/PaddlePaddle/Serving/tree/v0.8.3/examples/C%2B%2B/fit_a_line) 示例
```
## 下载模型
sh get_data.sh
## 启动服务
python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
## HTTP curl
curl -XPOST http://0.0.0.0:9393/GeneralModelService/inference -d ' {"tensor":[{"float_data":[0.0137,-0.1136,0.2553,-0.0692,0.0582,-0.0727,-0.1583,-0.0584,0.6283,0.4919,0.1856,0.0795,-0.0332],"elem_type":1,"name":"x","alias_name":"x","shape":[1,13]}],"fetch_var_names":["price"],"log_id":0}'
```
## 3.部署流程图
开发流程:①准备部署环境;②准备部署模型;③Serving程序开发;④服务启动与优化
**①准备部署环境**
Docker 是一个开源的应用容器引擎,可以让应用程序更方便地被打包和移植。建议在 Docker 中进行 Serving 服务化部署,并在 Serving Docker 环境中安装 Python Wheel 包。
**②准备部署模型**
下载推理模型后,为了便于模型服务化部署,需要将推理模型保存成用于 Serving 部署的参数形式
**③Serving程序开发**
修改服务端和客户端代码以适配模型的前后处理;通过修改配置或命令行参数(如端口、指定硬件、并发数量等)指定部署参数。
**④服务启动与优化**
命令方式启动服务端和客户端,根据输出结果和性能指标做进一步的性能优化。
## 4.Demo 展示区
参考 [模型库](./4-0_ModelZoo_CN.md)
## 5.核心优势
Paddle Serving 具备工业级功能、高性能等优势。
**一.工业级**
- 支持 HTTP、gRPC、bRPC 等多种协议;提供 C++、Python、Java 语言 SDK
- 设计并实现基于有向无环图(DAG)的异步流水线高性能推理框架,具有多模型组合、异步调度、并发推理、动态批量、多卡多流推理、请求缓存等特性
- 适配 x86(Intel) CPU、ARM CPU、Nvidia GPU、昆仑 XPU、华为昇腾310/910、海光 DCU、Nvidia Jetson 等多种硬件
- 集成 Intel MKLDNN、Nvidia TensorRT 加速库,以及低精度和量化推理
- 提供一套模型安全部署解决方案,包括加密模型部署、鉴权校验、HTTPs 安全网关,并在实际项目中应用
- 支持云端部署,提供百度云智能云 kubernetes 集群部署 Paddle Serving 案例
- 提供丰富的经典模型部署示例,如 PaddleOCR、PaddleClas、PaddleDetection、PaddleSeg、PaddleNLP 和 PaddleRec等套件,共计40多个预训练精品模型
**二.高性能**
# 1. 测试环境和说明
1) GPU型号:Tesla P4(7611 Mib)
2) Cuda版本:11.0
3) 模型:ResNet_v2_50
4) 为了测试异步合并batch的效果,测试数据中batch=1
5) [使用的测试代码和使用的数据集](../../examples/C++/PaddleClas/resnet_v2_50)
6) 下图中蓝色是C++ Serving,灰色为TF-Serving。
7) 折线图为QPS,数值越大表示每秒钟处理的请求数量越大,性能就越好。
8) 柱状图为平均处理时延,数值越大表示单个请求处理时间越长,性能就越差。
同步模式默认参数配置情况下,C++ Serving QPS 和平均时延指标均优于 TF-Serving。
<p align="center">
<br>
<img src='../images/syn_benchmark.png'>
<br>
<p>
异步模式情况下,两者性能接近,但当 Client 并发数达到70的时候,TF-Serving 服务直接超时,而 C++ Serving 能够正常返回结果。
<p align="center">
<br>
<img src='../images/asyn_benchmark.png'>
<br>
<p>
## 6.合作案例
## 7.资源汇总
## 8.开发者贡献&社区
......@@ -33,6 +33,7 @@ python -m paddle_serving_server.serve \
**二. C++ Serving 设置动态 shape**
1. 方法一:
在 `**/paddle_inference/paddle/include/paddle_engine.h` 中修改如下代码
```
......@@ -127,6 +128,55 @@ python -m paddle_serving_server.serve \
}
```
2. 方法二:
参考 `**/python/paddle_serving_server/serve.py` 中如下代码生成配置信息,
并使用 `server.set_trt_dynamic_shape_info(info)` 方法进行设置
```
def set_ocr_dynamic_shape_info():
info = []
min_input_shape = {
"x": [1, 3, 50, 50],
"conv2d_182.tmp_0": [1, 1, 20, 20],
"nearest_interp_v2_2.tmp_0": [1, 1, 20, 20],
"nearest_interp_v2_3.tmp_0": [1, 1, 20, 20],
"nearest_interp_v2_4.tmp_0": [1, 1, 20, 20],
"nearest_interp_v2_5.tmp_0": [1, 1, 20, 20]
}
max_input_shape = {
"x": [1, 3, 1536, 1536],
"conv2d_182.tmp_0": [20, 200, 960, 960],
"nearest_interp_v2_2.tmp_0": [20, 200, 960, 960],
"nearest_interp_v2_3.tmp_0": [20, 200, 960, 960],
"nearest_interp_v2_4.tmp_0": [20, 200, 960, 960],
"nearest_interp_v2_5.tmp_0": [20, 200, 960, 960],
}
opt_input_shape = {
"x": [1, 3, 960, 960],
"conv2d_182.tmp_0": [3, 96, 240, 240],
"nearest_interp_v2_2.tmp_0": [3, 96, 240, 240],
"nearest_interp_v2_3.tmp_0": [3, 24, 240, 240],
"nearest_interp_v2_4.tmp_0": [3, 24, 240, 240],
"nearest_interp_v2_5.tmp_0": [3, 24, 240, 240],
}
det_info = {
"min_input_shape": min_input_shape,
"max_input_shape": max_input_shape,
"opt_input_shape": opt_input_shape,
}
info.append(det_info)
min_input_shape = {"x": [1, 3, 32, 10], "lstm_1.tmp_0": [1, 1, 128]}
max_input_shape = {"x": [50, 3, 32, 1000], "lstm_1.tmp_0": [500, 50, 128]}
opt_input_shape = {"x": [6, 3, 32, 100], "lstm_1.tmp_0": [25, 5, 128]}
rec_info = {
"min_input_shape": min_input_shape,
"max_input_shape": max_input_shape,
"opt_input_shape": opt_input_shape,
}
info.append(rec_info)
return info
```
## Pipeline Serving
......
......@@ -16,6 +16,8 @@ The following is the dynamic shape api
For detail, please refer to API doc [C++](https://paddleinference.paddlepaddle.org.cn/api_reference/cxx_api_doc/Config/GPUConfig.html#tensorrt)/[Python](https://paddleinference.paddlepaddle.org.cn/api_reference/python_api_doc/Config/GPUConfig.html#tensorrt)
### C++ Serving
1. Method 1:
Modify the following code in `**/paddle_inference/paddle/include/paddle_engine.h`
```
......@@ -110,6 +112,54 @@ Modify the following code in `**/paddle_inference/paddle/include/paddle_engine.h
}
```
2. Method 2:
Refer to the code of `**/python/paddle_serving_server/serve.py` below to generate the configuration information,
and using method `server.set_trt_dynamic_shape_info(info)` to set information.
```
def set_ocr_dynamic_shape_info():
info = []
min_input_shape = {
"x": [1, 3, 50, 50],
"conv2d_182.tmp_0": [1, 1, 20, 20],
"nearest_interp_v2_2.tmp_0": [1, 1, 20, 20],
"nearest_interp_v2_3.tmp_0": [1, 1, 20, 20],
"nearest_interp_v2_4.tmp_0": [1, 1, 20, 20],
"nearest_interp_v2_5.tmp_0": [1, 1, 20, 20]
}
max_input_shape = {
"x": [1, 3, 1536, 1536],
"conv2d_182.tmp_0": [20, 200, 960, 960],
"nearest_interp_v2_2.tmp_0": [20, 200, 960, 960],
"nearest_interp_v2_3.tmp_0": [20, 200, 960, 960],
"nearest_interp_v2_4.tmp_0": [20, 200, 960, 960],
"nearest_interp_v2_5.tmp_0": [20, 200, 960, 960],
}
opt_input_shape = {
"x": [1, 3, 960, 960],
"conv2d_182.tmp_0": [3, 96, 240, 240],
"nearest_interp_v2_2.tmp_0": [3, 96, 240, 240],
"nearest_interp_v2_3.tmp_0": [3, 24, 240, 240],
"nearest_interp_v2_4.tmp_0": [3, 24, 240, 240],
"nearest_interp_v2_5.tmp_0": [3, 24, 240, 240],
}
det_info = {
"min_input_shape": min_input_shape,
"max_input_shape": max_input_shape,
"opt_input_shape": opt_input_shape,
}
info.append(det_info)
min_input_shape = {"x": [1, 3, 32, 10], "lstm_1.tmp_0": [1, 1, 128]}
max_input_shape = {"x": [50, 3, 32, 1000], "lstm_1.tmp_0": [500, 50, 128]}
opt_input_shape = {"x": [6, 3, 32, 100], "lstm_1.tmp_0": [25, 5, 128]}
rec_info = {
"min_input_shape": min_input_shape,
"max_input_shape": max_input_shape,
"opt_input_shape": opt_input_shape,
}
info.append(rec_info)
return info
```
### Pipeline Serving
......
# 图像分类
## 1.获取模型
```
wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/ResNet50_vd_infer.tar && tar xf ResNet50_vd_infer.tar
```
## 2.用 paddle_serving_client 把下载的推理模型转换为用于 Serving 部署的模型参数
```
# 保存 ResNet50_vd 模型参数
python3 -m paddle_serving_client.convert --dirname ./ResNet50_vd_infer/ \
--model_filename inference.pdmodel \
--params_filename inference.pdiparams \
--serving_server ./ResNet50_vd_serving/ \
--serving_client ./ResNet50_vd_client/
```
保存参数后,会在当前文件夹多出 `ResNet50_vd_serving` 和 `ResNet50_vd_client` 的文件夹:
```
├── daisy.jpg
├── http_client.py
├── imagenet.label
├── ResNet50_vd_client
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
├── ResNet50_vd_infer
│   ├── inference.pdiparams
│   ├── inference.pdiparams.info
│   └── inference.pdmodel
├── ResNet50_vd_serving
│   ├── fluid_time_file
│   ├── inference.pdiparams
│   ├── inference.pdmodel
│   ├── serving_server_conf.prototxt
│   └── serving_server_conf.stream.prototxt
├── rpc_client.py
```
**三.启动服务**
C++ Serving 服务可以指定一个网络端口同时接收 HTTP、gRPC 和 bRPC 请求。命令参数 `--model` 指定模型路径,`--gpu_ids` 指定 GPU 卡,`--port` 指定端口。
```
python3 -m paddle_serving_server.serve --model ResNet50_vd_serving --gpu_ids 0 --port 9394
```
**四.启动客户端**
1. `http_client.py` 封装了 HTTP 请求客户端
```
python3 http_client.py
```
2. `rpc_client.py` 封装了 gRPC 请求客户端
```
python3 rpc_client.py
```
成功运行后,模型预测的结果会打印如下:
```
prediction: daisy, probability: 0.9341399073600769
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddle_serving_client import HttpClient
#app
from paddle_serving_app.reader import Sequential, URL2Image, Resize
from paddle_serving_app.reader import CenterCrop, RGB2BGR, Transpose, Div, Normalize
import time
client = HttpClient()
client.load_client_config("./ResNet50_vd_client/serving_client_conf.prototxt")
'''
if you want to use the gRPC client, call set_use_grpc_client(True),
or you can directly use client.grpc_client_predict(...);
as for the HTTP client, call set_use_grpc_client(False) (which is the default),
or you can directly use client.http_client_predict(...)
'''
#client.set_use_grpc_client(True)
'''
if you want to enable the Encrypt Module, uncomment the following line
'''
#client.use_key("./key")
'''
if you want to enable compression, uncomment the following lines
'''
#client.set_response_compress(True)
#client.set_request_compress(True)
'''
we recommend using the Proto data format in the HTTP body, set True (which is the default);
if you want to use the JSON data format in the HTTP body, set False
'''
#client.set_http_proto(True)
client.connect(["127.0.0.1:9394"])
label_dict = {}
label_idx = 0
with open("imagenet.label") as fin:
    for line in fin:
        label_dict[label_idx] = line.strip()
        label_idx += 1
#preprocess
seq = Sequential([
    URL2Image(), Resize(256), CenterCrop(224), RGB2BGR(), Transpose((2, 0, 1)),
    Div(255), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True)
])
start = time.time()
image_file = "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"
for i in range(1):
    img = seq(image_file)
    res = client.predict(feed={"inputs": img}, fetch=[], batch=False)
    if res is None:
        raise ValueError("predict error")
    if res.err_no != 0:
        raise ValueError("predict error. Response : {}".format(res))
    max_val = res.outputs[0].tensor[0].float_data[0]
    max_idx = 0
    for one_data in enumerate(res.outputs[0].tensor[0].float_data):
        if one_data[1] > max_val:
            max_val = one_data[1]
            max_idx = one_data[0]
    label = label_dict[max_idx].strip().replace(",", "")
    print("prediction: {}, probability: {}".format(label, max_val))
end = time.time()
print(end - start)
tench, Tinca tinca,
goldfish, Carassius auratus,
great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias,
tiger shark, Galeocerdo cuvieri,
hammerhead, hammerhead shark,
electric ray, crampfish, numbfish, torpedo,
stingray,
cock,
hen,
ostrich, Struthio camelus,
brambling, Fringilla montifringilla,
goldfinch, Carduelis carduelis,
house finch, linnet, Carpodacus mexicanus,
junco, snowbird,
indigo bunting, indigo finch, indigo bird, Passerina cyanea,
robin, American robin, Turdus migratorius,
bulbul,
jay,
magpie,
chickadee,
water ouzel, dipper,
kite,
bald eagle, American eagle, Haliaeetus leucocephalus,
vulture,
great grey owl, great gray owl, Strix nebulosa,
European fire salamander, Salamandra salamandra,
common newt, Triturus vulgaris,
eft,
spotted salamander, Ambystoma maculatum,
axolotl, mud puppy, Ambystoma mexicanum,
bullfrog, Rana catesbeiana,
tree frog, tree-frog,
tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui,
loggerhead, loggerhead turtle, Caretta caretta,
leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea,
mud turtle,
terrapin,
box turtle, box tortoise,
banded gecko,
common iguana, iguana, Iguana iguana,
American chameleon, anole, Anolis carolinensis,
whiptail, whiptail lizard,
agama,
frilled lizard, Chlamydosaurus kingi,
alligator lizard,
Gila monster, Heloderma suspectum,
green lizard, Lacerta viridis,
African chameleon, Chamaeleo chamaeleon,
Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis,
African crocodile, Nile crocodile, Crocodylus niloticus,
American alligator, Alligator mississipiensis,
triceratops,
thunder snake, worm snake, Carphophis amoenus,
ringneck snake, ring-necked snake, ring snake,
hognose snake, puff adder, sand viper,
green snake, grass snake,
king snake, kingsnake,
garter snake, grass snake,
water snake,
vine snake,
night snake, Hypsiglena torquata,
boa constrictor, Constrictor constrictor,
rock python, rock snake, Python sebae,
Indian cobra, Naja naja,
green mamba,
sea snake,
horned viper, cerastes, sand viper, horned asp, Cerastes cornutus,
diamondback, diamondback rattlesnake, Crotalus adamanteus,
sidewinder, horned rattlesnake, Crotalus cerastes,
trilobite,
harvestman, daddy longlegs, Phalangium opilio,
scorpion,
black and gold garden spider, Argiope aurantia,
barn spider, Araneus cavaticus,
garden spider, Aranea diademata,
black widow, Latrodectus mactans,
tarantula,
wolf spider, hunting spider,
tick,
centipede,
black grouse,
ptarmigan,
ruffed grouse, partridge, Bonasa umbellus,
prairie chicken, prairie grouse, prairie fowl,
peacock,
quail,
partridge,
African grey, African gray, Psittacus erithacus,
macaw,
sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita,
lorikeet,
coucal,
bee eater,
hornbill,
hummingbird,
jacamar,
toucan,
drake,
red-breasted merganser, Mergus serrator,
goose,
black swan, Cygnus atratus,
tusker,
echidna, spiny anteater, anteater,
platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus,
wallaby, brush kangaroo,
koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus,
wombat,
jellyfish,
sea anemone, anemone,
brain coral,
flatworm, platyhelminth,
nematode, nematode worm, roundworm,
conch,
snail,
slug,
sea slug, nudibranch,
chiton, coat-of-mail shell, sea cradle, polyplacophore,
chambered nautilus, pearly nautilus, nautilus,
Dungeness crab, Cancer magister,
rock crab, Cancer irroratus,
fiddler crab,
king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica,
American lobster, Northern lobster, Maine lobster, Homarus americanus,
spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish,
crayfish, crawfish, crawdad, crawdaddy,
hermit crab,
isopod,
white stork, Ciconia ciconia,
black stork, Ciconia nigra,
spoonbill,
flamingo,
little blue heron, Egretta caerulea,
American egret, great white heron, Egretta albus,
bittern,
crane,
limpkin, Aramus pictus,
European gallinule, Porphyrio porphyrio,
American coot, marsh hen, mud hen, water hen, Fulica americana,
bustard,
ruddy turnstone, Arenaria interpres,
red-backed sandpiper, dunlin, Erolia alpina,
redshank, Tringa totanus,
dowitcher,
oystercatcher, oyster catcher,
pelican,
king penguin, Aptenodytes patagonica,
albatross, mollymawk,
grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus,
killer whale, killer, orca, grampus, sea wolf, Orcinus orca,
dugong, Dugong dugon,
sea lion,
Chihuahua,
Japanese spaniel,
Maltese dog, Maltese terrier, Maltese,
Pekinese, Pekingese, Peke,
Shih-Tzu,
Blenheim spaniel,
papillon,
toy terrier,
Rhodesian ridgeback,
Afghan hound, Afghan,
basset, basset hound,
beagle,
bloodhound, sleuthhound,
bluetick,
black-and-tan coonhound,
Walker hound, Walker foxhound,
English foxhound,
redbone,
borzoi, Russian wolfhound,
Irish wolfhound,
Italian greyhound,
whippet,
Ibizan hound, Ibizan Podenco,
Norwegian elkhound, elkhound,
otterhound, otter hound,
Saluki, gazelle hound,
Scottish deerhound, deerhound,
Weimaraner,
Staffordshire bullterrier, Staffordshire bull terrier,
American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier,
Bedlington terrier,
Border terrier,
Kerry blue terrier,
Irish terrier,
Norfolk terrier,
Norwich terrier,
Yorkshire terrier,
wire-haired fox terrier,
Lakeland terrier,
Sealyham terrier, Sealyham,
Airedale, Airedale terrier,
cairn, cairn terrier,
Australian terrier,
Dandie Dinmont, Dandie Dinmont terrier,
Boston bull, Boston terrier,
miniature schnauzer,
giant schnauzer,
standard schnauzer,
Scotch terrier, Scottish terrier, Scottie,
Tibetan terrier, chrysanthemum dog,
silky terrier, Sydney silky,
soft-coated wheaten terrier,
West Highland white terrier,
Lhasa, Lhasa apso,
flat-coated retriever,
curly-coated retriever,
golden retriever,
Labrador retriever,
Chesapeake Bay retriever,
German short-haired pointer,
vizsla, Hungarian pointer,
English setter,
Irish setter, red setter,
Gordon setter,
Brittany spaniel,
clumber, clumber spaniel,
English springer, English springer spaniel,
Welsh springer spaniel,
cocker spaniel, English cocker spaniel, cocker,
Sussex spaniel,
Irish water spaniel,
kuvasz,
schipperke,
groenendael,
malinois,
briard,
kelpie,
komondor,
Old English sheepdog, bobtail,
Shetland sheepdog, Shetland sheep dog, Shetland,
collie,
Border collie,
Bouvier des Flandres, Bouviers des Flandres,
Rottweiler,
German shepherd, German shepherd dog, German police dog, alsatian,
Doberman, Doberman pinscher,
miniature pinscher,
Greater Swiss Mountain dog,
Bernese mountain dog,
Appenzeller,
EntleBucher,
boxer,
bull mastiff,
Tibetan mastiff,
French bulldog,
Great Dane,
Saint Bernard, St Bernard,
Eskimo dog, husky,
malamute, malemute, Alaskan malamute,
Siberian husky,
dalmatian, coach dog, carriage dog,
affenpinscher, monkey pinscher, monkey dog,
basenji,
pug, pug-dog,
Leonberg,
Newfoundland, Newfoundland dog,
Great Pyrenees,
Samoyed, Samoyede,
Pomeranian,
chow, chow chow,
keeshond,
Brabancon griffon,
Pembroke, Pembroke Welsh corgi,
Cardigan, Cardigan Welsh corgi,
toy poodle,
miniature poodle,
standard poodle,
Mexican hairless,
timber wolf, grey wolf, gray wolf, Canis lupus,
white wolf, Arctic wolf, Canis lupus tundrarum,
red wolf, maned wolf, Canis rufus, Canis niger,
coyote, prairie wolf, brush wolf, Canis latrans,
dingo, warrigal, warragal, Canis dingo,
dhole, Cuon alpinus,
African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus,
hyena, hyaena,
red fox, Vulpes vulpes,
kit fox, Vulpes macrotis,
Arctic fox, white fox, Alopex lagopus,
grey fox, gray fox, Urocyon cinereoargenteus,
tabby, tabby cat,
tiger cat,
Persian cat,
Siamese cat, Siamese,
Egyptian cat,
cougar, puma, catamount, mountain lion, painter, panther, Felis concolor,
lynx, catamount,
leopard, Panthera pardus,
snow leopard, ounce, Panthera uncia,
jaguar, panther, Panthera onca, Felis onca,
lion, king of beasts, Panthera leo,
tiger, Panthera tigris,
cheetah, chetah, Acinonyx jubatus,
brown bear, bruin, Ursus arctos,
American black bear, black bear, Ursus americanus, Euarctos americanus,
ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus,
sloth bear, Melursus ursinus, Ursus ursinus,
mongoose,
meerkat, mierkat,
tiger beetle,
ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle,
ground beetle, carabid beetle,
long-horned beetle, longicorn, longicorn beetle,
leaf beetle, chrysomelid,
dung beetle,
rhinoceros beetle,
weevil,
fly,
bee,
ant, emmet, pismire,
grasshopper, hopper,
cricket,
walking stick, walkingstick, stick insect,
cockroach, roach,
mantis, mantid,
cicada, cicala,
leafhopper,
lacewing, lacewing fly,
"dragonfly, darning needle, devils darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
damselfly,
admiral,
ringlet, ringlet butterfly,
monarch, monarch butterfly, milkweed butterfly, Danaus plexippus,
cabbage butterfly,
sulphur butterfly, sulfur butterfly,
lycaenid, lycaenid butterfly,
starfish, sea star,
sea urchin,
sea cucumber, holothurian,
wood rabbit, cottontail, cottontail rabbit,
hare,
Angora, Angora rabbit,
hamster,
porcupine, hedgehog,
fox squirrel, eastern fox squirrel, Sciurus niger,
marmot,
beaver,
guinea pig, Cavia cobaya,
sorrel,
zebra,
hog, pig, grunter, squealer, Sus scrofa,
wild boar, boar, Sus scrofa,
warthog,
hippopotamus, hippo, river horse, Hippopotamus amphibius,
ox,
water buffalo, water ox, Asiatic buffalo, Bubalus bubalis,
bison,
ram, tup,
bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis,
ibex, Capra ibex,
hartebeest,
impala, Aepyceros melampus,
gazelle,
Arabian camel, dromedary, Camelus dromedarius,
llama,
weasel,
mink,
polecat, fitch, foulmart, foumart, Mustela putorius,
black-footed ferret, ferret, Mustela nigripes,
otter,
skunk, polecat, wood pussy,
badger,
armadillo,
three-toed sloth, ai, Bradypus tridactylus,
orangutan, orang, orangutang, Pongo pygmaeus,
gorilla, Gorilla gorilla,
chimpanzee, chimp, Pan troglodytes,
gibbon, Hylobates lar,
siamang, Hylobates syndactylus, Symphalangus syndactylus,
guenon, guenon monkey,
patas, hussar monkey, Erythrocebus patas,
baboon,
macaque,
langur,
colobus, colobus monkey,
proboscis monkey, Nasalis larvatus,
marmoset,
capuchin, ringtail, Cebus capucinus,
howler monkey, howler,
titi, titi monkey,
spider monkey, Ateles geoffroyi,
squirrel monkey, Saimiri sciureus,
Madagascar cat, ring-tailed lemur, Lemur catta,
indri, indris, Indri indri, Indri brevicaudatus,
Indian elephant, Elephas maximus,
African elephant, Loxodonta africana,
lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens,
giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca,
barracouta, snoek,
eel,
coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch,
rock beauty, Holocanthus tricolor,
anemone fish,
sturgeon,
gar, garfish, garpike, billfish, Lepisosteus osseus,
lionfish,
puffer, pufferfish, blowfish, globefish,
abacus,
abaya,
"academic gown, academic robe, judges robe",
accordion, piano accordion, squeeze box,
acoustic guitar,
aircraft carrier, carrier, flattop, attack aircraft carrier,
airliner,
airship, dirigible,
altar,
ambulance,
amphibian, amphibious vehicle,
analog clock,
apiary, bee house,
apron,
ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin,
assault rifle, assault gun,
backpack, back pack, knapsack, packsack, rucksack, haversack,
bakery, bakeshop, bakehouse,
balance beam, beam,
balloon,
ballpoint, ballpoint pen, ballpen, Biro,
Band Aid,
banjo,
bannister, banister, balustrade, balusters, handrail,
barbell,
barber chair,
barbershop,
barn,
barometer,
barrel, cask,
barrow, garden cart, lawn cart, wheelbarrow,
baseball,
basketball,
bassinet,
bassoon,
bathing cap, swimming cap,
bath towel,
bathtub, bathing tub, bath, tub,
beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon,
beacon, lighthouse, beacon light, pharos,
beaker,
bearskin, busby, shako,
beer bottle,
beer glass,
bell cote, bell cot,
bib,
bicycle-built-for-two, tandem bicycle, tandem,
bikini, two-piece,
binder, ring-binder,
binoculars, field glasses, opera glasses,
birdhouse,
boathouse,
bobsled, bobsleigh, bob,
bolo tie, bolo, bola tie, bola,
bonnet, poke bonnet,
bookcase,
bookshop, bookstore, bookstall,
bottlecap,
bow,
bow tie, bow-tie, bowtie,
brass, memorial tablet, plaque,
brassiere, bra, bandeau,
breakwater, groin, groyne, mole, bulwark, seawall, jetty,
breastplate, aegis, egis,
broom,
bucket, pail,
buckle,
bulletproof vest,
bullet train, bullet,
butcher shop, meat market,
cab, hack, taxi, taxicab,
caldron, cauldron,
candle, taper, wax light,
cannon,
canoe,
can opener, tin opener,
cardigan,
car mirror,
carousel, carrousel, merry-go-round, roundabout, whirligig,
"carpenters kit, tool kit",
carton,
car wheel,
cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM,
cassette,
cassette player,
castle,
catamaran,
CD player,
cello, violoncello,
cellular telephone, cellular phone, cellphone, cell, mobile phone,
chain,
chainlink fence,
chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour,
chain saw, chainsaw,
chest,
chiffonier, commode,
chime, bell, gong,
china cabinet, china closet,
Christmas stocking,
church, church building,
cinema, movie theater, movie theatre, movie house, picture palace,
cleaver, meat cleaver, chopper,
cliff dwelling,
cloak,
clog, geta, patten, sabot,
cocktail shaker,
coffee mug,
coffeepot,
coil, spiral, volute, whorl, helix,
combination lock,
computer keyboard, keypad,
confectionery, confectionary, candy store,
container ship, containership, container vessel,
convertible,
corkscrew, bottle screw,
cornet, horn, trumpet, trump,
cowboy boot,
cowboy hat, ten-gallon hat,
cradle,
crane,
crash helmet,
crate,
crib, cot,
Crock Pot,
croquet ball,
crutch,
cuirass,
dam, dike, dyke,
desk,
desktop computer,
dial telephone, dial phone,
diaper, nappy, napkin,
digital clock,
digital watch,
dining table, board,
dishrag, dishcloth,
dishwasher, dish washer, dishwashing machine,
disk brake, disc brake,
dock, dockage, docking facility,
dogsled, dog sled, dog sleigh,
dome,
doormat, welcome mat,
drilling platform, offshore rig,
drum, membranophone, tympan,
drumstick,
dumbbell,
Dutch oven,
electric fan, blower,
electric guitar,
electric locomotive,
entertainment center,
envelope,
espresso maker,
face powder,
feather boa, boa,
file, file cabinet, filing cabinet,
fireboat,
fire engine, fire truck,
fire screen, fireguard,
flagpole, flagstaff,
flute, transverse flute,
folding chair,
football helmet,
forklift,
fountain,
fountain pen,
four-poster,
freight car,
French horn, horn,
frying pan, frypan, skillet,
fur coat,
garbage truck, dustcart,
gasmask, respirator, gas helmet,
gas pump, gasoline pump, petrol pump, island dispenser,
goblet,
go-kart,
golf ball,
golfcart, golf cart,
gondola,
gong, tam-tam,
gown,
grand piano, grand,
greenhouse, nursery, glasshouse,
grille, radiator grille,
grocery store, grocery, food market, market,
guillotine,
hair slide,
hair spray,
half track,
hammer,
hamper,
hand blower, blow dryer, blow drier, hair dryer, hair drier,
hand-held computer, hand-held microcomputer,
handkerchief, hankie, hanky, hankey,
hard disc, hard disk, fixed disk,
harmonica, mouth organ, harp, mouth harp,
harp,
harvester, reaper,
hatchet,
holster,
home theater, home theatre,
honeycomb,
hook, claw,
hoopskirt, crinoline,
horizontal bar, high bar,
horse cart, horse-cart,
hourglass,
iPod,
iron, smoothing iron,
"jack-o-lantern",
jean, blue jean, denim,
jeep, landrover,
jersey, T-shirt, tee shirt,
jigsaw puzzle,
jinrikisha, ricksha, rickshaw,
joystick,
kimono,
knee pad,
knot,
lab coat, laboratory coat,
ladle,
lampshade, lamp shade,
laptop, laptop computer,
lawn mower, mower,
lens cap, lens cover,
letter opener, paper knife, paperknife,
library,
lifeboat,
lighter, light, igniter, ignitor,
limousine, limo,
liner, ocean liner,
lipstick, lip rouge,
Loafer,
lotion,
loudspeaker, speaker, speaker unit, loudspeaker system, speaker system,
"loupe, jewelers loupe",
lumbermill, sawmill,
magnetic compass,
mailbag, postbag,
mailbox, letter box,
maillot,
maillot, tank suit,
manhole cover,
maraca,
marimba, xylophone,
mask,
matchstick,
maypole,
maze, labyrinth,
measuring cup,
medicine chest, medicine cabinet,
megalith, megalithic structure,
microphone, mike,
microwave, microwave oven,
military uniform,
milk can,
minibus,
miniskirt, mini,
minivan,
missile,
mitten,
mixing bowl,
mobile home, manufactured home,
Model T,
modem,
monastery,
monitor,
moped,
mortar,
mortarboard,
mosque,
mosquito net,
motor scooter, scooter,
mountain bike, all-terrain bike, off-roader,
mountain tent,
mouse, computer mouse,
mousetrap,
moving van,
muzzle,
nail,
neck brace,
necklace,
nipple,
notebook, notebook computer,
obelisk,
oboe, hautboy, hautbois,
ocarina, sweet potato,
odometer, hodometer, mileometer, milometer,
oil filter,
organ, pipe organ,
oscilloscope, scope, cathode-ray oscilloscope, CRO,
overskirt,
oxcart,
oxygen mask,
packet,
paddle, boat paddle,
paddlewheel, paddle wheel,
padlock,
paintbrush,
"pajama, pyjama, pjs, jammies",
palace,
panpipe, pandean pipe, syrinx,
paper towel,
parachute, chute,
parallel bars, bars,
park bench,
parking meter,
passenger car, coach, carriage,
patio, terrace,
pay-phone, pay-station,
pedestal, plinth, footstall,
pencil box, pencil case,
pencil sharpener,
perfume, essence,
Petri dish,
photocopier,
pick, plectrum, plectron,
pickelhaube,
picket fence, paling,
pickup, pickup truck,
pier,
piggy bank, penny bank,
pill bottle,
pillow,
ping-pong ball,
pinwheel,
pirate, pirate ship,
pitcher, ewer,
"plane, carpenters plane, woodworking plane",
planetarium,
plastic bag,
plate rack,
plow, plough,
"plunger, plumbers helper",
Polaroid camera, Polaroid Land camera,
pole,
police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria,
poncho,
pool table, billiard table, snooker table,
pop bottle, soda bottle,
pot, flowerpot,
"potters wheel",
power drill,
prayer rug, prayer mat,
printer,
prison, prison house,
projectile, missile,
projector,
puck, hockey puck,
punching bag, punch bag, punching ball, punchball,
purse,
quill, quill pen,
quilt, comforter, comfort, puff,
racer, race car, racing car,
racket, racquet,
radiator,
radio, wireless,
radio telescope, radio reflector,
rain barrel,
recreational vehicle, RV, R.V.,
reel,
reflex camera,
refrigerator, icebox,
remote control, remote,
restaurant, eating house, eating place, eatery,
revolver, six-gun, six-shooter,
rifle,
rocking chair, rocker,
rotisserie,
rubber eraser, rubber, pencil eraser,
rugby ball,
rule, ruler,
running shoe,
safe,
safety pin,
saltshaker, salt shaker,
sandal,
sarong,
sax, saxophone,
scabbard,
scale, weighing machine,
school bus,
schooner,
scoreboard,
screen, CRT screen,
screw,
screwdriver,
seat belt, seatbelt,
sewing machine,
shield, buckler,
shoe shop, shoe-shop, shoe store,
shoji,
shopping basket,
shopping cart,
shovel,
shower cap,
shower curtain,
ski,
ski mask,
sleeping bag,
slide rule, slipstick,
sliding door,
slot, one-armed bandit,
snorkel,
snowmobile,
snowplow, snowplough,
soap dispenser,
soccer ball,
sock,
solar dish, solar collector, solar furnace,
sombrero,
soup bowl,
space bar,
space heater,
space shuttle,
spatula,
speedboat,
"spider web, spiders web",
spindle,
sports car, sport car,
spotlight, spot,
stage,
steam locomotive,
steel arch bridge,
steel drum,
stethoscope,
stole,
stone wall,
stopwatch, stop watch,
stove,
strainer,
streetcar, tram, tramcar, trolley, trolley car,
stretcher,
studio couch, day bed,
stupa, tope,
submarine, pigboat, sub, U-boat,
suit, suit of clothes,
sundial,
sunglass,
sunglasses, dark glasses, shades,
sunscreen, sunblock, sun blocker,
suspension bridge,
swab, swob, mop,
sweatshirt,
swimming trunks, bathing trunks,
swing,
switch, electric switch, electrical switch,
syringe,
table lamp,
tank, army tank, armored combat vehicle, armoured combat vehicle,
tape player,
teapot,
teddy, teddy bear,
television, television system,
tennis ball,
thatch, thatched roof,
theater curtain, theatre curtain,
thimble,
thresher, thrasher, threshing machine,
throne,
tile roof,
toaster,
tobacco shop, tobacconist shop, tobacconist,
toilet seat,
torch,
totem pole,
tow truck, tow car, wrecker,
toyshop,
tractor,
trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi,
tray,
trench coat,
tricycle, trike, velocipede,
trimaran,
tripod,
triumphal arch,
trolleybus, trolley coach, trackless trolley,
trombone,
tub, vat,
turnstile,
typewriter keyboard,
umbrella,
unicycle, monocycle,
upright, upright piano,
vacuum, vacuum cleaner,
vase,
vault,
velvet,
vending machine,
vestment,
viaduct,
violin, fiddle,
volleyball,
waffle iron,
wall clock,
wallet, billfold, notecase, pocketbook,
wardrobe, closet, press,
warplane, military plane,
washbasin, handbasin, washbowl, lavabo, wash-hand basin,
washer, automatic washer, washing machine,
water bottle,
water jug,
water tower,
whiskey jug,
whistle,
wig,
window screen,
window shade,
Windsor tie,
wine bottle,
wing,
wok,
wooden spoon,
wool, woolen, woollen,
worm fence, snake fence, snake-rail fence, Virginia fence,
wreck,
yawl,
yurt,
web site, website, internet site, site,
comic book,
crossword puzzle, crossword,
street sign,
traffic light, traffic signal, stoplight,
book jacket, dust cover, dust jacket, dust wrapper,
menu,
plate,
guacamole,
consomme,
hot pot, hotpot,
trifle,
ice cream, icecream,
ice lolly, lolly, lollipop, popsicle,
French loaf,
bagel, beigel,
pretzel,
cheeseburger,
hotdog, hot dog, red hot,
mashed potato,
head cabbage,
broccoli,
cauliflower,
zucchini, courgette,
spaghetti squash,
acorn squash,
butternut squash,
cucumber, cuke,
artichoke, globe artichoke,
bell pepper,
cardoon,
mushroom,
Granny Smith,
strawberry,
orange,
lemon,
fig,
pineapple, ananas,
banana,
jackfruit, jak, jack,
custard apple,
pomegranate,
hay,
carbonara,
chocolate sauce, chocolate syrup,
dough,
meat loaf, meatloaf,
pizza, pizza pie,
potpie,
burrito,
red wine,
espresso,
cup,
eggnog,
alp,
bubble,
cliff, drop, drop-off,
coral reef,
geyser,
lakeside, lakeshore,
promontory, headland, head, foreland,
sandbar, sand bar,
seashore, coast, seacoast, sea-coast,
valley, vale,
volcano,
ballplayer, baseball player,
groom, bridegroom,
scuba diver,
rapeseed,
daisy,
"yellow ladys slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
corn,
acorn,
hip, rose hip, rosehip,
buckeye, horse chestnut, conker,
coral fungus,
agaric,
gyromitra,
stinkhorn, carrion fungus,
earthstar,
hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa,
bolete,
ear, spike, capitulum,
toilet tissue, toilet paper, bathroom tissue
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddle_serving_client import Client
#app
from paddle_serving_app.reader import Sequential, URL2Image, Resize
from paddle_serving_app.reader import CenterCrop, RGB2BGR, Transpose, Div, Normalize
import time
client = Client()
client.load_client_config("./ResNet50_vd_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9394"])
label_dict = {}
label_idx = 0
with open("imagenet.label") as fin:
    for line in fin:
        label_dict[label_idx] = line.strip()
        label_idx += 1
#preprocess
seq = Sequential([
    URL2Image(), Resize(256), CenterCrop(224), RGB2BGR(), Transpose((2, 0, 1)),
    Div(255), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True)
])
start = time.time()
image_file = "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"
for i in range(1):
    img = seq(image_file)
    fetch_map = client.predict(feed={"inputs": img}, fetch=[], batch=False)
    prob = max(fetch_map["save_infer_model/scale_0.tmp_1"][0])
    label = label_dict[fetch_map["save_infer_model/scale_0.tmp_1"][0].tolist()
                       .index(prob)].strip().replace(",", "")
    print("prediction: {}, probability: {}".format(label, prob))
end = time.time()
print(end - start)
## IMDB 评论情绪预测 ABTest 服务
(简体中文|[English](./README.md))
......@@ -11,16 +11,24 @@ sh get_data.sh
### 启动预测服务(支持BRPC-Client/GRPC-Client/Http-Client)
```python
## 启动 bow 模型服务
python3 -m paddle_serving_server.serve --model imdb_bow_model/ --port 9297 >/dev/null 2>&1 &
## 启动 cnn 模型服务
python3 -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9298 >/dev/null 2>&1 &
## 启动 lstm 模型服务
python3 -m paddle_serving_server.serve --model imdb_lstm_model/ --port 9299 >/dev/null 2>&1 &
```
### ABTest 预测
```
head test_data/part-0 | python3 abtest_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab
```
预测test_data/part-0的前十个样例。
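如需验证各 variant 的流量分配比例(10%/30%/60%)是否符合预期,可以统计客户端输出中的 server_tag 字段(abtest_client.py 会打印形如 `server_tag=cnn prediction=...` 的结果)。以下为示意脚本,假设已将客户端输出重定向保存到 abtest_result.txt:
```python
# 统计 ABTest 输出中各 variant 命中次数与占比的示意脚本
import re
from collections import Counter

tag_counter = Counter()
with open("abtest_result.txt") as f:
    for line in f:
        m = re.search(r"server_tag=(\w+)", line)
        if m:
            tag_counter[m.group(1)] += 1

total = sum(tag_counter.values()) or 1
for tag, count in tag_counter.items():
    print("{}: {} ({:.1f}%)".format(tag, count, 100.0 * count / total))
```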
### http预测
```
head test_data/part-0 | python3 test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab
```
......@@ -11,35 +11,35 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import Client
from paddle_serving_app.reader.imdb_reader import IMDBDataset
import sys
import numpy as np
client = Client()
client.load_client_config(sys.argv[1])
client.add_variant("bow", ["127.0.0.1:9297"], 10)
client.add_variant("cnn", ["127.0.0.1:9298"], 30)
client.add_variant("lstm", ["127.0.0.1:9299"], 60)
client.connect()
# you can define any english sentence or dataset here
# This example reuses imdb reader in training, you
# can define your own data preprocessing easily.
imdb_dataset = IMDBDataset()
imdb_dataset.load_resource(sys.argv[2])

for line in sys.stdin:
    word_ids, label = imdb_dataset.get_words_and_label(line)
    word_len = len(word_ids)
    feed = {
        "words": np.array(word_ids).reshape(word_len, 1),
        "words.lod": [0, word_len]
    }
    #print(feed)
    fetch = ["prediction"]
    fetch_map = client.predict(
        feed=feed, fetch=fetch, batch=True, need_variant_tag=True)
    print("server_tag={} prediction={} ".format(fetch_map[1], fetch_map[0][
        "prediction"][0]))
......@@ -40,7 +40,7 @@ we recommend use Proto data format in HTTP-body, set True(which is default)
if you want use JSON data format in HTTP-body, set False
'''
#client.set_http_proto(True)
client.connect(["127.0.0.1:9297"])
# you can define any english sentence or dataset here
# This example reuses imdb reader in training, you
......
# In-batch Negatives
**目录**
* [模型下载](#模型下载)
* [模型部署](#模型部署)
<a name="模型下载"></a>
## 1. 语义索引模型
**语义索引训练模型下载链接:**
以下模型结构参数为: `TransformerLayer:12, Hidden:768, Heads:12, OutputEmbSize: 256`
|Model|训练参数配置|硬件|MD5|
| ------------ | ------------ | ------------ |-----------|
|[batch_neg](https://bj.bcebos.com/v1/paddlenlp/models/inbatch_model.zip)|<div style="width: 150pt">margin:0.2 scale:30 epoch:3 lr:5E-5 bs:64 max_len:64 </div>|<div style="width: 100pt">4卡 v100-16g</div>|f3e5c7d7b0b718c2530c5e1b136b2d74|
```
wget https://bj.bcebos.com/v1/paddlenlp/models/inbatch_model.zip
unzip inbatch_model.zip -d checkpoints
```
<a name="模型部署"></a>
## 2. 模型部署
### 2.1 动转静导出
首先把动态图模型转换为静态图:
```
python export_model.py --params_path checkpoints/model_40/model_state.pdparams --output_path=./output
```
也可以运行下面的bash脚本:
```
sh scripts/export_model.sh
```
### 2.2 Paddle Inference预测
预测既可以抽取向量也可以计算两个文本的相似度。
修改id2corpus的样本:
```
# 抽取向量
id2corpus={0:'国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
# 计算相似度
corpus_list=[['中西方语言与文化的差异','中西方文化差异以及语言体现中西方文化,差异,语言体现'],
['中西方语言与文化的差异','飞桨致力于让深度学习技术的创新与应用更简单']]
```
然后使用PaddleInference
```
python deploy/python/predict.py --model_dir=./output
```
也可以运行下面的bash脚本:
```
sh deploy.sh
```
最终输出的是256维度的特征向量和句子对的预测概率:
```
(1, 256)
[[-0.0394925 -0.04474756 -0.065534 0.00939134 0.04359895 0.14659195
-0.0091779 -0.07303623 0.09413272 -0.01255222 -0.08685658 0.02762237
0.10138468 0.00962821 0.10888419 0.04553023 0.05898942 0.00694253
....
[0.959269642829895, 0.04725276678800583]
```
### 2.3 Paddle Serving部署
Paddle Serving 的详细文档请参考 [Pipeline_Design](https://github.com/PaddlePaddle/Serving/blob/v0.7.0/doc/Python_Pipeline/Pipeline_Design_CN.md)[Serving_Design](https://github.com/PaddlePaddle/Serving/blob/v0.7.0/doc/Serving_Design_CN.md),首先把静态图模型转换成Serving的格式:
```
python export_to_serving.py \
--dirname "output" \
--model_filename "inference.get_pooled_embedding.pdmodel" \
--params_filename "inference.get_pooled_embedding.pdiparams" \
--server_path "./serving_server" \
--client_path "./serving_client" \
--fetch_alias_names "output_embedding"
```
参数含义说明
* `dirname`: 需要转换的模型文件存储路径,Program 结构文件和参数文件均保存在此目录。
* `model_filename`: 存储需要转换的模型 Inference Program 结构的文件名称。如果设置为 None ,则使用 `__model__` 作为默认的文件名
* `params_filename`: 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为 None
* `server_path`: 转换后的模型文件和配置文件的存储路径。默认值为 serving_server
* `client_path`: 转换后的客户端配置文件存储路径。默认值为 serving_client
* `fetch_alias_names`: 模型输出的别名设置,比如输入的 input_ids 等,都可以指定成其他名字,默认不指定
* `feed_alias_names`: 模型输入的别名设置,比如输出 pooled_out 等,都可以重新指定成其他模型,默认不指定
也可以运行下面的 bash 脚本:
```
sh scripts/export_to_serving.sh
```
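As a reference, the same conversion can also be called directly from Python; a minimal sketch of the call made by export_to_serving.py, with paths assumed to match the command above:
```
import paddle_serving_client.io as serving_io

# Minimal sketch: convert the exported static graph model into Serving format.
# Paths and file names are assumed to match the export step above.
serving_io.inference_model_to_serving(
    dirname="output",
    serving_server="./serving_server",
    serving_client="./serving_client",
    model_filename="inference.get_pooled_embedding.pdmodel",
    params_filename="inference.get_pooled_embedding.pdiparams",
    fetch_alias_names="output_embedding")
```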
Paddle Serving can be deployed in two ways: the Pipeline mode and the C++ mode. Both are described below:
#### 2.3.1 Pipeline Mode
Start the Pipeline Server:
```
python web_service.py
```
Then start the client to call the server.
First, modify the samples to be predicted in rpc_client.py:
```
list_data = [
"国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据",
"试论翻译过程中的文化差异与语言空缺翻译过程,文化差异,语言空缺,文化对比"
]
```
Then run:
```
python rpc_client.py
```
The model output is:
```
{'0': '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据', '1': '试论翻译过程中的文化差异与语言空缺翻译过程,文化差异,语言空缺,文化对比'}
PipelineClient::predict pack_data time:1641450851.3752182
PipelineClient::predict before time:1641450851.375738
['output_embedding']
(2, 256)
[[ 0.07830612 -0.14036864 0.03433796 -0.14967982 -0.03386067 0.06630666
0.01357943 0.03531194 0.02411093 0.02000859 0.05724002 -0.08119463
......
```
You can see that the client sent 2 texts and received 2 embedding vectors.
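The Pipeline service also listens on the HTTP port configured in config_nlp.yml (http_port: 18082). A minimal sketch, assuming the conventional `/<service_name>/prediction` route and the key/value JSON body used by Pipeline services; verify against your deployment:
```
# Sketch only: assumes the Pipeline HTTP route /ernie/prediction and the
# key/value JSON convention; adjust to your actual deployment.
import requests

url = "http://127.0.0.1:18082/ernie/prediction"
texts = [
    "国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据",
    "试论翻译过程中的文化差异与语言空缺翻译过程,文化差异,语言空缺,文化对比"
]
data = {"key": [str(i) for i in range(len(texts))], "value": texts}
resp = requests.post(url, json=data)
print(resp.json())
```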
#### 2.3.2 C++ Mode
Start the C++ Serving server:
```
python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_id 2 --thread 5 --ir_optim True --use_trt --precision FP16
```
Alternatively, use the script:
```
sh deploy/C++/start_server.sh
```
The client can use either HTTP or RPC. For RPC:
```
python deploy/C++/rpc_client.py
```
The output is:
```
I0209 20:40:07.978225 20896 general_model.cpp:490] [client]logid=0,client_cost=395.695ms,server_cost=392.559ms.
time to cost :0.3960278034210205 seconds
{'output_embedding': array([[ 9.01343748e-02, -1.21870913e-01, 1.32834800e-02,
-1.57673359e-01, -2.60387752e-02, 6.98455423e-02,
1.58108603e-02, 3.89952064e-02, 3.22783105e-02,
3.49135026e-02, 7.66086206e-02, -9.12970975e-02,
6.25643134e-02, 7.21886680e-02, 7.03565404e-02,
5.44054210e-02, 3.25332815e-03, 5.01751155e-02,
......
```
You can see that the server returned the vectors.
Or use the HTTP client access mode:
```
python deploy/C++/http_client.py
```
The output is:
```
(2, 64)
(2, 64)
outputs {
tensor {
float_data: 0.09013437479734421
float_data: -0.12187091261148453
float_data: 0.01328347995877266
float_data: -0.15767335891723633
......
```
You can see that the server returned the vectors.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import sys
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class SemanticIndexBase(nn.Layer):
def __init__(self, pretrained_model, dropout=None, output_emb_size=None):
super().__init__()
self.ptm = pretrained_model
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
# if output_emb_size is not None, add a Linear layer to reduce the embedding size;
# we recommend setting output_emb_size = 256 considering the trade-off between
# recall performance and efficiency
self.output_emb_size = output_emb_size
if output_emb_size > 0:
weight_attr = paddle.ParamAttr(
initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
self.emb_reduce_linear = paddle.nn.Linear(
768, output_emb_size, weight_attr=weight_attr)
@paddle.jit.to_static(input_spec=[
paddle.static.InputSpec(
shape=[None, None], dtype='int64'), paddle.static.InputSpec(
shape=[None, None], dtype='int64')
])
def get_pooled_embedding(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None):
_, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids,
attention_mask)
if self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
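# L2-normalize the embedding so that inner products equal cosine similarity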
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
def get_semantic_embedding(self, data_loader):
self.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
input_ids = paddle.to_tensor(input_ids)
token_type_ids = paddle.to_tensor(token_type_ids)
text_embeddings = self.get_pooled_embedding(
input_ids, token_type_ids=token_type_ids)
yield text_embeddings
def cosine_sim(self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids,
query_attention_mask)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids,
title_attention_mask)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding,
axis=-1)
return cosine_sim
@abc.abstractmethod
def forward(self):
pass
class SemanticIndexBaseStatic(nn.Layer):
def __init__(self, pretrained_model, dropout=None, output_emb_size=None):
super().__init__()
self.ptm = pretrained_model
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
# if output_emb_size is not None, add a Linear layer to reduce the embedding size;
# we recommend setting output_emb_size = 256 considering the trade-off between
# recall performance and efficiency
self.output_emb_size = output_emb_size
if output_emb_size > 0:
weight_attr = paddle.ParamAttr(
initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
self.emb_reduce_linear = paddle.nn.Linear(
768, output_emb_size, weight_attr=weight_attr)
@paddle.jit.to_static(input_spec=[
paddle.static.InputSpec(
shape=[None, None], dtype='int64'), paddle.static.InputSpec(
shape=[None, None], dtype='int64')
])
def get_pooled_embedding(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None):
_, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids,
attention_mask)
if self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
def get_semantic_embedding(self, data_loader):
self.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
input_ids = paddle.to_tensor(input_ids)
token_type_ids = paddle.to_tensor(token_type_ids)
text_embeddings = self.get_pooled_embedding(
input_ids, token_type_ids=token_type_ids)
yield text_embeddings
def cosine_sim(self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids,
query_attention_mask)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids,
title_attention_mask)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding,
axis=-1)
return cosine_sim
def forward(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None):
_, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids,
attention_mask)
if self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
# coding:utf-8
# pylint: disable=doc-string-missing
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
import numpy as np
import requests
import json
from paddle_serving_client import HttpClient
import paddlenlp as ppnlp
def convert_example(example,
tokenizer,
max_seq_length=512,
pad_to_max_seq_len=True):
list_input_ids = []
list_token_type_ids = []
for text in example:
encoded_inputs = tokenizer(
text=text,
max_seq_len=max_seq_length,
pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
list_input_ids.append(input_ids)
list_token_type_ids.append(token_type_ids)
return list_input_ids, list_token_type_ids
# Start the Python client
endpoint_list = ['127.0.0.1:9393']
client = HttpClient()
client.load_client_config('serving_client')
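# serving_client holds the client prototxt generated by export_to_serving.py,
# which defines the feed/fetch variable names read below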
client.connect(endpoint_list)
feed_names = client.feed_names_
fetch_names = client.fetch_names_
print(feed_names)
print(fetch_names)
# Create the tokenizer
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
max_seq_len = 64
# Data preprocessing
list_data = ['国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据.', '面向生态系统服务的生态系统分类方案研发与应用']
# for i in range(5):
# list_data.extend(list_data)
# print(len(list_data))
examples = convert_example(list_data, tokenizer, max_seq_length=max_seq_len)
print(examples)
feed_dict = {}
feed_dict['input_ids'] = np.array(examples[0])
feed_dict['token_type_ids'] = np.array(examples[1])
print(feed_dict['input_ids'].shape)
print(feed_dict['token_type_ids'].shape)
# batch=True means batch prediction
b_start = time.time()
result = client.predict(feed=feed_dict, fetch=fetch_names, batch=True)
b_end = time.time()
print(result)
print("time to cost :{} seconds".format(b_end - b_start))
# coding:utf-8
# pylint: disable=doc-string-missing
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
import numpy as np
from paddle_serving_client import Client
import paddlenlp as ppnlp
def convert_example(example,
tokenizer,
max_seq_length=512,
pad_to_max_seq_len=True):
list_input_ids = []
list_token_type_ids = []
for text in example:
encoded_inputs = tokenizer(
text=text,
max_seq_len=max_seq_length,
pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
list_input_ids.append(input_ids)
list_token_type_ids.append(token_type_ids)
return list_input_ids, list_token_type_ids
# Start the Python client
endpoint_list = ['127.0.0.1:9393']
client = Client()
client.load_client_config('serving_client')
client.connect(endpoint_list)
feed_names = client.feed_names_
fetch_names = client.fetch_names_
print(feed_names)
print(fetch_names)
# Create the tokenizer
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
max_seq_len = 64
# Data preprocessing
list_data = ['国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据.', '面向生态系统服务的生态系统分类方案研发与应用']
# for i in range(5):
# list_data.extend(list_data)
# print(len(list_data))
examples = convert_example(list_data, tokenizer, max_seq_length=max_seq_len)
print(examples)
feed_dict = {}
feed_dict['input_ids'] = np.array(examples[0])
feed_dict['token_type_ids'] = np.array(examples[1])
print(feed_dict['input_ids'].shape)
print(feed_dict['token_type_ids'].shape)
# batch=True means batch prediction
b_start = time.time()
result = client.predict(feed=feed_dict, fetch=fetch_names, batch=True)
b_end = time.time()
print("time to cost :{} seconds".format(b_end - b_start))
print(result)
python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_id 2 --thread 5 --ir_optim True --use_trt --precision FP16
\ No newline at end of file
# worker_num: maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each building a gRPC server and a DAG.
# When build_dag_each_worker=False, the framework sets max_workers=worker_num for the gRPC thread pool of the main thread.
worker_num: 20
# build_dag_each_worker: False, the framework creates one DAG inside the process; True, the framework creates an independent DAG in each process.
build_dag_each_worker: false
dag:
# Op resource type: True for the thread model, False for the process model
is_thread_op: False
# Profiling: True generates Timeline performance data (with some performance overhead); False disables it
tracer:
interval_s: 10
# HTTP port. rpc_port and http_port must not both be empty. When rpc_port is available and http_port is empty, http_port is not generated automatically
http_port: 18082
# RPC port. rpc_port and http_port must not both be empty. When rpc_port is empty and http_port is not, rpc_port is automatically set to http_port+1
rpc_port: 8088
op:
ernie:
# Concurrency: thread-level when is_thread_op=True, otherwise process-level
concurrency: 1
# When the op config has no server_endpoints, the local service configuration is read from local_service_conf
local_service_conf:
# Client type: brpc, grpc or local_predictor. local_predictor does not start a Serving service and predicts in-process
client_type: local_predictor
#ir_optim
ir_optim: True
# device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 1
# Compute device IDs. When devices is "" or omitted, prediction runs on CPU; when devices is "0" or "0,1,2", prediction runs on GPU using the listed cards
devices: '2'
# Fetch list, using the alias_name of fetch_var in client_config; if not set, all outputs are returned
fetch_list: ['output_embedding']
# Model path
model_config: ../../serving_server/
python predict.py --model_dir=../../output
\ No newline at end of file
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import sys
import numpy as np
import paddle
import paddlenlp as ppnlp
from scipy.special import softmax
from scipy import spatial
from paddle import inference
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.datasets import load_dataset
from paddlenlp.utils.log import logger
sys.path.append('.')
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True,
help="The directory to static model.")
parser.add_argument("--max_seq_length", default=128, type=int,
help="The maximum total input sequence length after tokenization. Sequences "
"longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=15, type=int,
help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu",
help="Select which device to train model, defaults to gpu.")
parser.add_argument('--use_tensorrt', default=False, type=eval, choices=[True, False],
help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"],
help='The tensorrt precision.')
parser.add_argument('--cpu_threads', default=10, type=int,
help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', default=False, type=eval, choices=[True, False],
help='Enable to use mkldnn to speed up when using cpu.')
parser.add_argument("--benchmark", type=eval, default=False,
help="To log some information about environment and running.")
parser.add_argument("--save_log_path", type=str, default="./log_output/",
help="The file path to save log.")
args = parser.parse_args()
# yapf: enable
def convert_example(example,
tokenizer,
max_seq_length=512,
pad_to_max_seq_len=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`dict`): A dict mapping each key to the text to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
pad_to_max_seq_len(obj:`bool`, defaults to `False`): Whether to pad the sequence to max_seq_len.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(
text=text,
max_seq_len=max_seq_length,
pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class Predictor(object):
def __init__(self,
model_dir,
device="gpu",
max_seq_length=128,
batch_size=32,
use_tensorrt=False,
precision="fp32",
cpu_threads=10,
enable_mkldnn=False):
self.max_seq_length = max_seq_length
self.batch_size = batch_size
model_file = model_dir + "/inference.pdmodel"
params_file = model_dir + "/inference.pdiparams"
if not os.path.exists(model_file):
raise ValueError("not find model file path {}".format(model_file))
if not os.path.exists(params_file):
raise ValueError("not find params file path {}".format(params_file))
config = paddle.inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as initializing the GPU memory and enabling TensorRT
config.enable_use_gpu(100, 0)
precision_map = {
"fp16": inference.PrecisionType.Half,
"fp32": inference.PrecisionType.Float32,
"int8": inference.PrecisionType.Int8
}
precision_mode = precision_map[precision]
if args.use_tensorrt:
config.enable_tensorrt_engine(
max_batch_size=batch_size,
min_subgraph_size=30,
precision_mode=precision_mode)
elif device == "cpu":
# set CPU configs accordingly,
# such as enable_mkldnn, set_cpu_math_library_num_threads
config.disable_gpu()
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
config.set_cpu_math_library_num_threads(args.cpu_threads)
elif device == "xpu":
# set XPU configs accordingly
config.enable_xpu(100)
config.switch_use_feed_fetch_ops(False)
self.predictor = paddle.inference.create_predictor(config)
self.input_handles = [
self.predictor.get_input_handle(name)
for name in self.predictor.get_input_names()
]
self.output_handle = self.predictor.get_output_handle(
self.predictor.get_output_names()[0])
if args.benchmark:
import auto_log
pid = os.getpid()
self.autolog = auto_log.AutoLogger(
model_name="ernie-1.0",
model_precision=precision,
batch_size=self.batch_size,
data_shape="dynamic",
save_path=args.save_log_path,
inference_config=config,
pids=pid,
process_name=None,
gpu_ids=0,
time_keys=[
'preprocess_time', 'inference_time', 'postprocess_time'
],
warmup=0,
logger=logger)
def extract_embedding(self, data, tokenizer):
"""
Predicts the data labels.
Args:
data (obj:`List(str)`): The batch data, each element of which is a raw text.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
Returns:
results(obj:`dict`): All the feature vectors.
"""
if args.benchmark:
self.autolog.times.start()
examples = []
for text in data:
input_ids, segment_ids = convert_example(text, tokenizer)
examples.append((input_ids, segment_ids))
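# batchify_fn pads input_ids and token_type_ids to the longest sequence in the
# batch using the tokenizer's pad token, then stacks them into arrays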
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
): fn(samples)
if args.benchmark:
self.autolog.times.stamp()
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
if args.benchmark:
self.autolog.times.stamp()
if args.benchmark:
self.autolog.times.end(stamp=True)
return logits
def predict(self, data, tokenizer):
"""
Predicts the data labels.
Args:
data (obj:`List(str)`): The batch data, each element of which is a raw text.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
Returns:
results(obj:`dict`): All the predictions probs.
"""
if args.benchmark:
self.autolog.times.start()
examples = []
for idx, text in enumerate(data):
input_ids, segment_ids = convert_example({idx: text[0]}, tokenizer)
title_ids, title_segment_ids = convert_example({
idx: text[1]
}, tokenizer)
examples.append(
(input_ids, segment_ids, title_ids, title_segment_ids))
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
): fn(samples)
if args.benchmark:
self.autolog.times.stamp()
query_ids, query_segment_ids, title_ids, title_segment_ids = batchify_fn(
examples)
self.input_handles[0].copy_from_cpu(query_ids)
self.input_handles[1].copy_from_cpu(query_segment_ids)
self.predictor.run()
query_logits = self.output_handle.copy_to_cpu()
self.input_handles[0].copy_from_cpu(title_ids)
self.input_handles[1].copy_from_cpu(title_segment_ids)
self.predictor.run()
title_logits = self.output_handle.copy_to_cpu()
if args.benchmark:
self.autolog.times.stamp()
if args.benchmark:
self.autolog.times.end(stamp=True)
result = [
float(1 - spatial.distance.cosine(arr1, arr2))
for arr1, arr2 in zip(query_logits, title_logits)
]
return result
if __name__ == "__main__":
# Define predictor to do prediction.
predictor = Predictor(args.model_dir, args.device, args.max_seq_length,
args.batch_size, args.use_tensorrt, args.precision,
args.cpu_threads, args.enable_mkldnn)
# ErnieTokenizer is used here for the ernie-1.0 pretrained model.
output_emb_size = 256
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
res = predictor.extract_embedding(corpus_list, tokenizer)
print(res.shape)
print(res)
corpus_list = [['中西方语言与文化的差异', '中西方文化差异以及语言体现中西方文化,差异,语言体现'],
['中西方语言与文化的差异', '飞桨致力于让深度学习技术的创新与应用更简单']]
res = predictor.predict(corpus_list, tokenizer)
print(res)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
from paddle_serving_server.pipeline import PipelineClient
client = PipelineClient()
client.connect(['127.0.0.1:8088'])
list_data = [
"国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据",
"试论翻译过程中的文化差异与语言空缺翻译过程,文化差异,语言空缺,文化对比"
]
feed = {}
for i, item in enumerate(list_data):
feed[str(i)] = item
print(feed)
start_time = time.time()
ret = client.predict(feed_dict=feed)
end_time = time.time()
print("time to cost :{} seconds".format(end_time - start_time))
result = np.array(eval(ret.value[0]))
print(ret.key)
print(result.shape)
print(result)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
import sys
from paddle_serving_server.web_service import WebService, Op
_LOGGER = logging.getLogger()
def convert_example(example,
tokenizer,
max_seq_length=512,
pad_to_max_seq_len=False):
result = []
for text in example:
encoded_inputs = tokenizer(
text=text,
max_seq_len=max_seq_length,
pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class ErnieOp(Op):
def init_op(self):
import paddlenlp as ppnlp
self.tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained(
'ernie-1.0')
def preprocess(self, input_dicts, data_id, log_id):
from paddlenlp.data import Stack, Tuple, Pad
(_, input_dict), = input_dicts.items()
print("input dict", input_dict)
batch_size = len(input_dict.keys())
examples = []
for i in range(batch_size):
input_ids, segment_ids = convert_example([input_dict[str(i)]],
self.tokenizer)
examples.append((input_ids, segment_ids))
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=self.tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=self.tokenizer.pad_token_id), # segment
): fn(samples)
input_ids, segment_ids = batchify_fn(examples)
feed_dict = {}
feed_dict['input_ids'] = input_ids
feed_dict['token_type_ids'] = segment_ids
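# return: (feed dict, skip-process flag, product error code, product error message)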
return feed_dict, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
new_dict = {}
new_dict["output_embedding"] = str(fetch_dict["output_embedding"]
.tolist())
return new_dict, None, ""
class ErnieService(WebService):
def get_pipeline_response(self, read_op):
ernie_op = ErnieOp(name="ernie", input_ops=[read_op])
return ernie_op
ernie_service = ErnieService(name="ernie")
ernie_service.prepare_pipeline_config("config_nlp.yml")
ernie_service.run_service()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from functools import partial
import numpy as np
import paddle
import paddle.nn.functional as F
import paddlenlp as ppnlp
from paddlenlp.data import Stack, Tuple, Pad
from base_model import SemanticIndexBase, SemanticIndexBaseStatic
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--params_path", type=str, required=True,
default='./checkpoint/model_900/model_state.pdparams', help="The path to model parameters to be loaded.")
parser.add_argument("--output_path", type=str, default='./output',
help="The path of model parameter in static graph to be saved.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
# If you want to use the ernie-1.0 model, use the following code
output_emb_size = 256
pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
"ernie-1.0")
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
model = SemanticIndexBaseStatic(
pretrained_model, output_emb_size=output_emb_size)
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)
model.eval()
# Convert to static graph with specific input description
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(
shape=[None, None], dtype="int64"), # input_ids
paddle.static.InputSpec(
shape=[None, None], dtype="int64") # segment_ids
])
# Save in static graph model.
save_path = os.path.join(args.output_path, "inference")
paddle.jit.save(model, save_path)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import paddle_serving_client.io as serving_io
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--dirname", type=str, required=True,
default='./output', help="Path of saved model files. Program file and parameter files are saved in this directory.")
parser.add_argument("--model_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdmodel', help="The name of file to load the inference program. If it is None, the default filename __model__ will be used.")
parser.add_argument("--params_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdiparams', help="The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. Default: None.")
parser.add_argument("--server_path", type=str, default='./serving_server',
help="The path of server parameter in static graph to be saved.")
parser.add_argument("--client_path", type=str, default='./serving_client',
help="The path of client parameter in static graph to be saved.")
parser.add_argument("--feed_alias_names", type=str, default=None,
help='set alias names for feed vars, split by comma \',\', you should run --show_proto to check the number of feed vars')
parser.add_argument("--fetch_alias_names", type=str, default=None,
help='set alias names for fetch vars, split by comma \',\', you should run --show_proto to check the number of fetch vars')
parser.add_argument("--show_proto", type=bool, default=False,
help='If yes, you can preview the proto and then determine your feed var alias name and fetch var alias name.')
# yapf: enable
if __name__ == "__main__":
args = parser.parse_args()
serving_io.inference_model_to_serving(
dirname=args.dirname,
serving_server=args.server_path,
serving_client=args.client_path,
model_filename=args.model_filename,
params_filename=args.params_filename,
show_proto=args.show_proto,
feed_alias_names=args.feed_alias_names,
fetch_alias_names=args.fetch_alias_names)
python export_model.py --params_path checkpoints/model_40/model_state.pdparams --output_path=./output
\ No newline at end of file
python export_to_serving.py \
--dirname "output" \
--model_filename "inference.get_pooled_embedding.pdmodel" \
--params_filename "inference.get_pooled_embedding.pdiparams" \
--server_path "serving_server" \
--client_path "serving_client" \
--fetch_alias_names "output_embedding"
......@@ -89,11 +89,13 @@ message Request {
message Response {
repeated ModelOutput outputs = 1;
repeated int64 profile_time = 2;
// Error code
int32 err_no = 3;
bool profile_server = 3;
uint64 log_id = 4;
// Error code
int32 err_no = 5;
// Error messages
string err_msg = 4;
string err_msg = 6;
};
message ModelOutput {
......
......@@ -241,10 +241,10 @@ class PaddleInferenceEngine : public EngineCore {
}
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
config.SetCpuMathLibraryNumThreads(engine_conf.cpu_math_thread_num());
if (engine_conf.has_use_gpu() && engine_conf.use_gpu()) {
// 2000MB GPU memory
config.EnableUseGpu(50, gpu_id);
config.EnableUseGpu(engine_conf.gpu_memory_mb(), gpu_id);
if (engine_conf.has_gpu_multi_stream() &&
engine_conf.gpu_multi_stream()) {
config.EnableGpuMultiStream();
......@@ -267,17 +267,17 @@ class PaddleInferenceEngine : public EngineCore {
if (engine_conf.has_use_trt() && engine_conf.use_trt()) {
config.SwitchIrOptim(true);
if (!engine_conf.has_use_gpu() || !engine_conf.use_gpu()) {
config.EnableUseGpu(50, gpu_id);
config.EnableUseGpu(engine_conf.gpu_memory_mb(), gpu_id);
if (engine_conf.has_gpu_multi_stream() &&
engine_conf.gpu_multi_stream()) {
config.EnableGpuMultiStream();
}
}
config.EnableTensorRtEngine(1 << 25,
config.EnableTensorRtEngine(engine_conf.trt_workspace_size(),
max_batch,
local_min_subgraph_size,
precision_type,
false,
engine_conf.trt_use_static(),
FLAGS_use_calib);
std::map<std::string, std::vector<int>> min_input_shape;
std::map<std::string, std::vector<int>> max_input_shape;
......@@ -413,7 +413,11 @@ class PaddleInferenceEngine : public EngineCore {
<< ", use_ascend_cl: " << engine_conf.has_use_ascend_cl()
<< ", use_xpu: " << engine_conf.use_xpu()
<< ", enable_memory_optimization: "
<< engine_conf.enable_memory_optimization();
<< engine_conf.enable_memory_optimization()
<< ", gpu_memory_mb: " << engine_conf.gpu_memory_mb()
<< ", cpu_math_thread_num: " << engine_conf.cpu_math_thread_num()
<< ", trt_workspace_size: " << engine_conf.trt_workspace_size()
<< ", trt_use_static: " << engine_conf.trt_use_static();
VLOG(2) << "create paddle predictor sucess, path: " << model_path;
return 0;
......
......@@ -93,7 +93,9 @@ class LocalPredictor(object):
use_ascend_cl=False,
min_subgraph_size=3,
dynamic_shape_info={},
use_calib=False):
use_calib=False,
collect_shape_range_info="",
tuned_dynamic_shape_info=""):
"""
Load model configs and create the paddle predictor by Paddle Inference API.
......@@ -160,12 +162,14 @@ class LocalPredictor(object):
"use_trt:{}, use_lite:{}, use_xpu:{}, precision:{}, use_calib:{}, "
"use_mkldnn:{}, mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
"mkldnn_bf16_op_list:{}, use_feed_fetch_ops:{}, "
"use_ascend_cl:{}, min_subgraph_size:{}, dynamic_shape_info:{}".
"use_ascend_cl:{}, min_subgraph_size:{}, dynamic_shape_info:{},"
"collect_shape_range_info:{},tuned_dynamic_shape_info:{}".
format(model_path, use_gpu, gpu_id, use_profile, thread_num,
mem_optim, ir_optim, use_trt, use_lite, use_xpu, precision,
use_calib, use_mkldnn, mkldnn_cache_capacity, mkldnn_op_list,
mkldnn_bf16_op_list, use_feed_fetch_ops, use_ascend_cl,
min_subgraph_size, dynamic_shape_info))
min_subgraph_size, dynamic_shape_info,
collect_shape_range_info,tuned_dynamic_shape_info))
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
......@@ -213,6 +217,8 @@ class LocalPredictor(object):
if mkldnn_op_list is not None:
config.set_mkldnn_op(mkldnn_op_list)
# set gpu
if collect_shape_range_info != "":
config.collect_shape_range_info(collect_shape_range_info)
if not use_gpu:
config.disable_gpu()
else:
......@@ -226,6 +232,9 @@ class LocalPredictor(object):
use_static=False,
use_calib_mode=use_calib)
if tuned_dynamic_shape_info != "":
config.enable_tuned_tensorrt_dynamic_shape(tuned_dynamic_shape_info, True)
@ErrorCatch
@ParamChecker
def dynamic_shape_info_helper(dynamic_shape_info:lambda dynamic_shape_info: check_dynamic_shape_info(dynamic_shape_info)):
......@@ -235,7 +244,7 @@ class LocalPredictor(object):
print("dynamic_shape_info configure error, it should contain [min_input_shape', 'max_input_shape', 'opt_input_shape' {}".format(resp.err_msg))
kill_stop_process_by_pid("kill", os.getpgid(os.getpid()))
if len(dynamic_shape_info):
if len(dynamic_shape_info) and tuned_dynamic_shape_info == "":
config.set_trt_dynamic_shape_info(
dynamic_shape_info['min_input_shape'],
dynamic_shape_info['max_input_shape'],
......
......@@ -280,6 +280,27 @@ def serve_args():
default="",
nargs="+",
help="min_subgraph_size")
parser.add_argument(
"--gpu_memory_mb",
type=int,
default=50,
help="Initially allocate GPU storage size")
parser.add_argument(
"--cpu_math_thread_num",
type=int,
default=1,
help="Initialize the number of CPU computing threads")
parser.add_argument(
"--trt_workspace_size",
type=int,
default=33554432,
help="Initialize allocation 1 << 25 GPU storage size")
parser.add_argument(
"--trt_use_static",
default=False,
action="store_true",
help="Initialize TRT with static data")
return parser.parse_args()
......@@ -396,10 +417,14 @@ def start_gpu_card_model(gpu_mode, port, args): # pylint: disable=doc-string-mi
server.set_dist_endpoints(args.dist_endpoints.split(","))
server.set_dist_subgraph_index(args.dist_subgraph_index)
server.set_min_subgraph_size(args.min_subgraph_size)
server.set_gpu_memory_mb(args.gpu_memory_mb)
server.set_cpu_math_thread_num(args.cpu_math_thread_num)
if args.use_trt and device == "gpu":
server.set_trt()
server.set_ir_optimize(True)
server.set_trt_workspace_size(args.trt_workspace_size)
server.set_trt_use_static(args.trt_use_static)
if is_ocr:
info = set_ocr_dynamic_shape_info()
server.set_trt_dynamic_shape_info(info)
......
......@@ -119,6 +119,10 @@ class Server(object):
self.dist_master_serving = False
self.min_subgraph_size = []
self.trt_dynamic_shape_info = []
self.gpu_memory_mb = 50
self.cpu_math_thread_num = 1
self.trt_workspace_size = 33554432 # 1 << 25
self.trt_use_static = False
def get_fetch_list(self, infer_node_idx=-1):
fetch_names = [
......@@ -289,6 +293,18 @@ class Server(object):
def set_trt_dynamic_shape_info(self, info):
self.trt_dynamic_shape_info = info
def set_gpu_memory_mb(self, gpu_memory_mb):
self.gpu_memory_mb = gpu_memory_mb
def set_cpu_math_thread_num(self, cpu_math_thread_num):
self.cpu_math_thread_num = cpu_math_thread_num
def set_trt_workspace_size(self, trt_workspace_size):
self.trt_workspace_size = trt_workspace_size
def set_trt_use_static(self, trt_use_static):
self.trt_use_static = trt_use_static
def _prepare_engine(self, model_config_paths, device, use_encryption_model):
self.device = device
if self.model_toolkit_conf == None:
......@@ -342,6 +358,10 @@ class Server(object):
engine.use_xpu = self.use_xpu
engine.use_ascend_cl = self.use_ascend_cl
engine.use_gpu = False
#engine.gpu_memory_mb = self.gpu_memory_mb
#engine.cpu_math_thread_num = self.cpu_math_thread_num
#engine.trt_workspace_size = self.trt_workspace_size
#engine.trt_use_static = self.trt_use_static
# use distributed model.
if self.dist_subgraph_index >= 0:
......
......@@ -53,7 +53,9 @@ class LocalServiceHandler(object):
mkldnn_bf16_op_list=None,
min_subgraph_size=3,
dynamic_shape_info={},
use_calib=False):
use_calib=False,
collect_shape_range_info="",
tuned_dynamic_shape_info=""):
"""
Initialization of localservicehandler
......@@ -99,6 +101,8 @@ class LocalServiceHandler(object):
self.min_subgraph_size = 3
self.dynamic_shape_info = {}
self._use_calib = False
self.collect_shape_range_info = ""
self.tuned_dynamic_shape_info = ""
if device_type == -1:
# device_type is not set, determined by `devices`,
......@@ -179,6 +183,8 @@ class LocalServiceHandler(object):
self._mkldnn_op_list = mkldnn_op_list
self._mkldnn_bf16_op_list = mkldnn_bf16_op_list
self._use_calib = use_calib
self.collect_shape_range_info = collect_shape_range_info
self.tuned_dynamic_shape_info = tuned_dynamic_shape_info
_LOGGER.info(
"Models({}) will be launched by device {}. use_gpu:{}, "
......@@ -187,14 +193,16 @@ class LocalServiceHandler(object):
"client_type:{}, fetch_names:{}, precision:{}, use_calib:{}, "
"use_mkldnn:{}, mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
"mkldnn_bf16_op_list:{}, use_ascend_cl:{}, min_subgraph_size:{},"
"is_set_dynamic_shape_info:{}".format(
"is_set_dynamic_shape_info:{},collect_shape_range_info:{},"
"tuned_dynamic_shape_info:{}".format(
model_config, self._device_name, self._use_gpu, self._use_trt,
self._use_lite, self._use_xpu, device_type, self._devices, self.
_mem_optim, self._ir_optim, self._use_profile, self._thread_num,
self._client_type, self._fetch_names, self._precision, self.
_use_calib, self._use_mkldnn, self._mkldnn_cache_capacity, self.
_mkldnn_op_list, self._mkldnn_bf16_op_list, self._use_ascend_cl,
self.min_subgraph_size, bool(len(self.dynamic_shape_info))))
self.min_subgraph_size, bool(len(self.dynamic_shape_info)),
self.collect_shape_range_info, self.tuned_dynamic_shape_info))
def get_fetch_list(self):
return self._fetch_names
......@@ -254,7 +262,9 @@ class LocalServiceHandler(object):
use_ascend_cl=self._use_ascend_cl,
min_subgraph_size=self.min_subgraph_size,
dynamic_shape_info=self.dynamic_shape_info,
use_calib=self._use_calib)
use_calib=self._use_calib,
collect_shape_range_info=self.collect_shape_range_info,
tuned_dynamic_shape_info=self.tuned_dynamic_shape_info)
return self._local_predictor_client
def get_client_config(self):
......
......@@ -121,6 +121,8 @@ class Op(object):
self._succ_close_op = False
self.dynamic_shape_info = {}
self.set_dynamic_shape_info()
self.collect_shape_range_info = ""
self.tuned_dynamic_shape_info = ""
def set_dynamic_shape_info(self):
"""
......@@ -235,6 +237,14 @@ class Op(object):
"mkldnn_bf16_op_list")
self.min_subgraph_size = local_service_conf.get(
"min_subgraph_size")
self.collect_shape_range_info = local_service_conf.get(
"collect_shape_range_info")
self.tuned_dynamic_shape_info = local_service_conf.get(
"tuned_dynamic_shape_info")
if self.collect_shape_range_info is None:
self.collect_shape_range_info = ""
if self.tuned_dynamic_shape_info is None:
self.tuned_dynamic_shape_info = ""
if self.model_config is None:
self.with_serving = False
......@@ -259,7 +269,9 @@ class Op(object):
mkldnn_bf16_op_list=self.mkldnn_bf16_op_list,
min_subgraph_size=self.min_subgraph_size,
dynamic_shape_info=self.dynamic_shape_info,
use_calib=self.use_calib)
use_calib=self.use_calib,
collect_shape_range_info=self.collect_shape_range_info,
tuned_dynamic_shape_info=self.tuned_dynamic_shape_info)
service_handler.prepare_server() # get fetch_list
serivce_ports = service_handler.get_port_list()
self._server_endpoints = [
......@@ -290,7 +302,9 @@ class Op(object):
mkldnn_bf16_op_list=self.mkldnn_bf16_op_list,
min_subgraph_size=self.min_subgraph_size,
dynamic_shape_info=self.dynamic_shape_info,
use_calib=self.use_calib)
use_calib=self.use_calib,
collect_shape_range_info=self.collect_shape_range_info,
tuned_dynamic_shape_info=self.tuned_dynamic_shape_info)
if self._client_config is None:
self._client_config = service_handler.get_client_config(
)
......@@ -1387,7 +1401,9 @@ class Op(object):
mkldnn_bf16_op_list=mkldnn_bf16_op_list,
min_subgraph_size=min_subgraph_size,
dynamic_shape_info=dynamic_shape_info,
use_calib=use_calib)
use_calib=use_calib,
collect_shape_range_info=self.collect_shape_range_info,
tuned_dynamic_shape_info=self.tuned_dynamic_shape_info)
_LOGGER.info("Init cuda env in process {}".format(
concurrency_idx))
......
......@@ -261,6 +261,8 @@ class PipelineServer(object):
"use_mkldnn": False,
"mkldnn_cache_capacity": 0,
"min_subgraph_size": 3,
"collect_shape_range_info": "",
"tuned_dynamic_shape_info": "",
},
}
for op in self._used_op:
......@@ -422,6 +424,8 @@ class ServerYamlConfChecker(object):
"use_mkldnn": False,
"mkldnn_cache_capacity": 0,
"min_subgraph_size": 3,
"collect_shape_range_info": "",
"tuned_dynamic_shape_info": "",
}
conf_type = {
"model_config": str,
......@@ -438,6 +442,8 @@ class ServerYamlConfChecker(object):
"mkldnn_op_list": list,
"mkldnn_bf16_op_list": list,
"min_subgraph_size": int,
"collect_shape_range_info": str,
"tuned_dynamic_shape_info": str,
}
conf_qualification = {"thread_num": (">=", 1), }
ServerYamlConfChecker.check_conf(conf, default_conf, conf_type,
......
......@@ -42,6 +42,13 @@ message Request {
message Response {
repeated ModelOutput outputs = 1;
repeated int64 profile_time = 2;
bool profile_server = 3;
uint64 log_id = 4;
// Error code
int32 err_no = 5;
// Error messages
string err_msg = 6;
};
message ModelOutput {
......