diff --git a/core/predictor/framework/prometheus_metric.cpp b/core/predictor/framework/prometheus_metric.cpp index 5ddd0d1df173ab11338c41644cc11fc6cbaaaf47..db4f2f4551482cdeee4fb2f24083a92f862c50f3 100644 --- a/core/predictor/framework/prometheus_metric.cpp +++ b/core/predictor/framework/prometheus_metric.cpp @@ -30,26 +30,26 @@ PrometheusMetric::PrometheusMetric() serializer_(new prometheus::TextSerializer()), query_success_family_( prometheus::BuildCounter() - .Name("pd_query_request_success") + .Name("pd_query_request_success_total") .Help("Number of successful query requests") .Register(*registry_)), query_failure_family_( prometheus::BuildCounter() - .Name("pd_query_request_failure") + .Name("pd_query_request_failure_total") .Help("Number of failed query requests") .Register(*registry_)), inf_count_family_(prometheus::BuildCounter() - .Name("pd_inference_count") + .Name("pd_inference_count_total") .Help("Number of inferences performed") .Register(*registry_)), query_duration_us_family_( prometheus::BuildCounter() - .Name("pd_query_request_duration_us") + .Name("pd_query_request_duration_us_total") .Help("Cummulative query request duration in microseconds") .Register(*registry_)), inf_duration_us_family_( prometheus::BuildCounter() - .Name("pd_inference_duration_us") + .Name("pd_inference_duration_us_total") .Help("Cummulative inference duration in microseconds") .Register(*registry_)), metrics_enabled_(false) diff --git a/doc/C++_Serving/Performance_Tuning_CN.md b/doc/C++_Serving/Performance_Tuning_CN.md index d2b2921b71a55624fd58e4ae8eb52eb07a9fd516..90ecc1e6713261897fc2ad08c4e34e43ea1c5209 100755 --- a/doc/C++_Serving/Performance_Tuning_CN.md +++ b/doc/C++_Serving/Performance_Tuning_CN.md @@ -58,3 +58,17 @@ Server端**线程数N**的设置需要结合三个因素来综合 ## 4.3 示例 请参考[examples/C++/PaddleOCR/ocr/README_CN.md](../../examples/C++/PaddleOCR/ocr/README_CN.md)中`C++ OCR Service服务章节`和[Paddle Serving中的集成预测](./Model_Ensemble_CN.md)中的例子。 + +# 5.请求缓存 +当**您的业务中有较多重复请求**时,您可以考虑使用C++Serving[Request Cache](./Request_Cache_CN.md)来提升服务性能 + +## 5.1 优点 +服务可以缓存请求结果,将请求数据与结果以键值对的形式保存。当有重复请求到来时,可以根据请求数据直接从缓存中获取结果并返回,而不需要进行模型预测等处理(耗时与请求数据大小有关,在毫秒量级)。 + +## 5.2 缺点 + +1) 需要额外的系统内存用于缓存请求结果,具体缓存大小可以通过启动参数进行配置。 +2) 对于未命中请求,会增加额外的时间用于根据请求数据检索缓存(耗时增加1%左右)。 + +## 5.3 示例 +请参考[Request Cache](./Request_Cache_CN.md)中的使用方法 \ No newline at end of file diff --git a/doc/Model_Zoo_CN.md b/doc/Model_Zoo_CN.md old mode 100644 new mode 100755 index 645523a2d6fbd01ee842dd146bb8bf290ba3ffd0..23472f26c8a60ad1113c3fcebcaeda7dc79d57c3 --- a/doc/Model_Zoo_CN.md +++ b/doc/Model_Zoo_CN.md @@ -10,6 +10,7 @@ | 模型 | 类型 | 示例使用的框架 | 下载 | | --- | --- | --- | ---- | +| pp_shitu | PaddleClas | [C++ Serving](../examples/C++/PaddleClas/pp_shitu) | [.tar.gz](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/inference/serving/pp_shitu.tar.gz) | | resnet_v2_50_imagenet | PaddleClas | [C++ Serving](../examples/C++/PaddleClas/resnet_v2_50)
[Pipeline Serving](../examples/Pipeline/PaddleClas/ResNet_V2_50) | [.tar.gz](https://paddle-serving.bj.bcebos.com/paddle_hub_models/image/ImageClassification/resnet_v2_50_imagenet.tar.gz) | Pipeline Serving, C++ Serving| | mobilenet_v2_imagenet | PaddleClas | [C++ Serving](../examples/C++/PaddleClas/mobilenet) | [.tar.gz](https://paddle-serving.bj.bcebos.com/paddle_hub_models/image/ImageClassification/mobilenet_v2_imagenet.tar.gz) | | resnet50_vd | PaddleClas | [C++ Serving](../examples/C++/PaddleClas/imagenet)
[Pipeline Serving](../examples/Pipeline/PaddleClas/ResNet50_vd) | [.tar.gz](https://paddle-serving.bj.bcebos.com/model/ResNet50_vd.tar) | @@ -27,6 +28,8 @@ | senta_bilstm | PaddleNLP | [C++ Serving](../examples/C++/PaddleNLP/senta) | [.tar.gz](https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SentimentAnalysis/senta_bilstm.tar.gz) |C++ Serving| | lac | PaddleNLP | [C++ Serving](../examples/C++/PaddleNLP/lac) | [.tar.gz](https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/LexicalAnalysis/lac.tar.gz) | | transformer | PaddleNLP | [Pipeline Serving](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/examples/machine_translation/transformer/deploy/serving/README.md) | [model](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/machine_translation/transformer) | +| ELECTRA | PaddleNLP | [Pipeline Serving](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/examples/language_model/electra/deploy/serving/README.md) | [model](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/language_model/electra) | +| In-batch Negatives | PaddleNLP | [Pipeline Serving](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/neural_search/recall/in_batch_negative) | [model](https://bj.bcebos.com/v1/paddlenlp/models/inbatch_model.zip) | | criteo_ctr | PaddleRec | [C++ Serving](../examples/C++/PaddleRec/criteo_ctr) | [.tar.gz](https://paddle-serving.bj.bcebos.com/criteo_ctr_example/criteo_ctr_demo_model.tar.gz) | | criteo_ctr_with_cube | PaddleRec | [C++ Serving](../examples/C++/PaddleRec/criteo_ctr_with_cube) | [.tar.gz](https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz) | | wide&deep | PaddleRec | [C++ Serving](https://github.com/PaddlePaddle/PaddleRec/blob/release/2.1.0/doc/serving.md) | [model](https://github.com/PaddlePaddle/PaddleRec/blob/release/2.1.0/models/rank/wide_deep/README.md) | @@ -66,4 +69,3 @@ - [PaddleRec](https://github.com/PaddlePaddle/PaddleRec) - [PaddleSeg](https://github.com/PaddlePaddle/PaddleSeg) - [PaddleGAN](https://github.com/PaddlePaddle/PaddleGAN) - diff --git a/doc/Model_Zoo_EN.md b/doc/Model_Zoo_EN.md old mode 100644 new mode 100755 index 67fcaf6dd5c535a7fc7c56336ac8ea9ccb996396..10baea39c3333f295c9be3b15a0f093ef1b5d0af --- a/doc/Model_Zoo_EN.md +++ b/doc/Model_Zoo_EN.md @@ -10,6 +10,7 @@ Special thanks to the [Padddle wholechain](https://www.paddlepaddle.org.cn/whole | Model | Type | Framework | Download | | --- | --- | --- | ---- | +| pp_shitu | PaddleClas | [C++ Serving](../examples/C++/PaddleClas/pp_shitu) | [.tar.gz](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/inference/serving/pp_shitu.tar.gz) | | resnet_v2_50_imagenet | PaddleClas | [C++ Serving](../examples/C++/PaddleClas/resnet_v2_50)
[Pipeline Serving](../examples/Pipeline/PaddleClas/ResNet_V2_50) | [.tar.gz](https://paddle-serving.bj.bcebos.com/paddle_hub_models/image/ImageClassification/resnet_v2_50_imagenet.tar.gz) | Pipeline Serving, C++ Serving| | mobilenet_v2_imagenet | PaddleClas | [C++ Serving](../examples/C++/PaddleClas/mobilenet) | [.tar.gz](https://paddle-serving.bj.bcebos.com/paddle_hub_models/image/ImageClassification/mobilenet_v2_imagenet.tar.gz) | | resnet50_vd | PaddleClas | [C++ Serving](../examples/C++/PaddleClas/imagenet)
[Pipeline Serving](../examples/Pipeline/PaddleClas/ResNet50_vd) | [.tar.gz](https://paddle-serving.bj.bcebos.com/model/ResNet50_vd.tar) |
diff --git a/doc/Prometheus_CN.md b/doc/Prometheus_CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..4807d8c7029b0f5252aa632a92511b762002c4ff
--- /dev/null
+++ b/doc/Prometheus_CN.md
@@ -0,0 +1,95 @@
+## Paddle Serving使用普罗米修斯监控
+
+Paddle Serving支持使用普罗米修斯(Prometheus)进行性能数据的监控。默认的访问接口为`http://localhost:19393/metrics`。数据形式为文本格式,您可以使用如下命令直观地看到:
+```
+curl http://localhost:19393/metrics
+```
+
+## 配置使用
+
+### C++ Server
+
+对于 C++ Server 来说,启动服务时请添加如下参数
+
+| 参数 | 参数说明 | 备注 |
+| :------- | :-------------------------- | :--------------------------------------------------------------- |
+| enable_prometheus | 开启Prometheus | 开启Prometheus功能 |
+| prometheus_port | Prometheus数据端口 | 默认为19393 |
+
+### Python Pipeline
+
+对于 Python Pipeline 来说,启动服务时请在配置文件config.yml中添加如下参数
+```
+dag:
+  #开启Prometheus
+  enable_prometheus: True
+  #配置Prometheus数据端口
+  prometheus_port: 19393
+```
+
+### 监控数据类型
+
+监控数据类型如下表
+
+| Metric | Frequency | Description |
+| ---------------------------------------------- | ----------- | ----------------------------------------------------- |
+| `pd_query_request_success_total` | Per request | Number of successful query requests |
+| `pd_query_request_failure_total` | Per request | Number of failed query requests |
+| `pd_inference_count_total` | Per request | Number of inferences performed |
+| `pd_query_request_duration_us_total` | Per request | Cumulative end-to-end query request handling time |
+| `pd_inference_duration_us_total` | Per request | Cumulative time requests spend executing the inference model |
+
+## 监控示例
+
+此处给出一个使用普罗米修斯进行服务监控的简单示例
+
+**1、获取镜像**
+
+```
+docker pull prom/node-exporter
+docker pull prom/prometheus
+```
+
+**2、运行镜像**
+
+```
+docker run -d -p 9100:9100 \
+  -v "/proc:/host/proc:ro" \
+  -v "/sys:/host/sys:ro" \
+  -v "/:/rootfs:ro" \
+  --net="host" \
+  prom/node-exporter
+```
+
+**3、配置**
+
+修改监控服务的配置文件/opt/prometheus/prometheus.yml,添加监控节点信息
+
+```
+global:
+  scrape_interval: 60s
+  evaluation_interval: 60s
+
+scrape_configs:
+  - job_name: prometheus
+    static_configs:
+      - targets: ['localhost:9090']
+        labels:
+          instance: prometheus
+
+  - job_name: linux
+    static_configs:
+      - targets: ['$IP:9100']
+        labels:
+          instance: localhost
+```
+
+**4、启动监控服务**
+
+```
+docker run -d \
+  -p 9090:9090 \
+  -v /opt/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
+  prom/prometheus
+```
+访问 `http://serverip:9090/graph` 即可
\ No newline at end of file
diff --git a/doc/Run_On_DCU_CN.md b/doc/Run_On_DCU_CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..fe8a12d5cc901d9a8da6d13326251508ab72f3d1
--- /dev/null
+++ b/doc/Run_On_DCU_CN.md
@@ -0,0 +1,69 @@
+## Paddle Serving使用海光芯片部署
+
+Paddle Serving支持使用海光DCU进行预测部署。目前支持的ROCm版本为4.0.1。
+
+## 安装Docker镜像
+我们推荐使用docker部署Serving服务,可以直接从Paddle的官方镜像库拉取预先装有ROCm4.0.1的docker镜像。
+```
+# 拉取镜像
+docker pull paddlepaddle/paddle:latest-dev-rocm4.0-miopen2.11
+
+# 启动容器,注意这里的参数,例如shm-size, device等都需要配置
+docker run -it --name paddle-rocm-dev --shm-size=128G \
+    --device=/dev/kfd --device=/dev/dri --group-add video \
+    --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+    paddlepaddle/paddle:latest-dev-rocm4.0-miopen2.11 /bin/bash
+
+# 检查容器是否可以正确识别海光DCU设备
+rocm-smi
+
+# 预期得到以下结果:
+======================= ROCm System Management Interface =======================
+================================= Concise Info =================================
+GPU Temp AvgPwr SCLK MCLK Fan Perf
PwrCap VRAM% GPU% +0 50.0c 23.0W 1319Mhz 800Mhz 0.0% auto 300.0W 0% 0% +1 48.0c 25.0W 1319Mhz 800Mhz 0.0% auto 300.0W 0% 0% +2 48.0c 24.0W 1319Mhz 800Mhz 0.0% auto 300.0W 0% 0% +3 49.0c 27.0W 1319Mhz 800Mhz 0.0% auto 300.0W 0% 0% +================================================================================ +============================= End of ROCm SMI Log ============================== +``` + +## 编译、安装 +基本环境配置可参考[该文档](Compile_CN.md)进行配置。 +### 编译 +* 编译server部分 +``` +cd Serving +mkdir -p server-build-dcu && cd server-build-dcu + +cmake -DPYTHON_INCLUDE_DIR=/opt/conda/include/python3.7m/ \ + -DPYTHON_LIBRARIES=/opt/conda/lib/libpython3.7m.so \ + -DPYTHON_EXECUTABLE=/opt/conda/bin/python \ + -DWITH_MKL=ON \ + -DWITH_ROCM=ON \ + -DSERVER=ON .. +make -j10 +``` + +### 安装wheel包 +编译步骤完成后,会在各自编译目录$build_dir/python/dist生成whl包,分别安装即可。例如server步骤,会在server-build-arm/python/dist目录下生成whl包, 使用命令```pip install -u xxx.whl```进行安装。 + + +## 部署使用示例 +以[resnet50](../examples/C++/PaddleClas/resnet_v2_50/README_CN.md)为例 + +### 启动rpc服务 + +启动rpc服务,基于1卡部署 +``` +python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --gpu_ids 1 +``` + +## 其他说明 + +### 模型实例及说明 +支持海光芯片部署模型列表见[链接](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/09_hardware_support/rocm_docs/paddle_rocm_cn.html)。不同模型适配上存在差异,可能存在不支持的情况,部署使用存在问题时,欢迎以[Github issue](https://github.com/PaddlePaddle/Serving/issues),我们会实时跟进。 + +### 昆仑芯片支持相关参考资料 +* [海光芯片运行飞桨](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/09_hardware_support/rocm_docs/paddle_install_cn.html) \ No newline at end of file diff --git a/doc/Run_On_JETSON_CN.md b/doc/Run_On_JETSON_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..b3549ebb2845aa2dd18bd3bca8b0101bfb909b8f --- /dev/null +++ b/doc/Run_On_JETSON_CN.md @@ -0,0 +1,42 @@ +## Paddle Serving使用JETSON部署 + +Paddle Serving支持使用JETSON进行预测部署。目前仅支持Pipeline模式。 + +### 安装PaddlePaddle + +可以参考[NV Jetson部署示例](https://paddleinference.paddlepaddle.org.cn/demo_tutorial/cuda_jetson_demo.html)安装python版本的paddlepaddle + + +### 安装PaddleServing + +安装ARM版本的whl包 +``` +# paddle-serving-server +https://paddle-serving.bj.bcebos.com/whl/xpu/arm/paddle_serving_server_xpu-0.0.0.post2-py3-none-any.whl +# paddle-serving-client +https://paddle-serving.bj.bcebos.com/whl/xpu/arm/paddle_serving_client-0.0.0-cp36-none-any.whl +# paddle-serving-app +https://paddle-serving.bj.bcebos.com/whl/xpu/arm/paddle_serving_app-0.0.0-py3-none-any.whl +``` + +### 部署使用 + +以[Uci](../examples/Pipeline/simple_web_service/README_CN.md)为例 + +启动服务 +``` +python3 web_service.py &>log.txt & +``` +其中修改config.yml中的对应配置项 +``` + #计算硬件类型: 空缺时由devices决定(CPU/GPU),0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu + device_type: 1 + + #计算硬件ID,优先由device_type决定硬件类型。devices为""或空缺时为CPU预测;当为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 + devices: "0,1" +``` + +## 其他说明 + +### Jetson支持相关参考资料 +* [Jetson运行飞桨](https://paddleinference.paddlepaddle.org.cn/demo_tutorial/cuda_jetson_demo.html) \ No newline at end of file diff --git a/doc/Run_On_NPU_CN.md b/doc/Run_On_NPU_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..2919ae7290071b409852634e4274911d8f46992b --- /dev/null +++ b/doc/Run_On_NPU_CN.md @@ -0,0 +1,196 @@ +## Paddle Serving使用昇腾NPU芯片部署 + +Paddle Serving支持使用昇腾NPU芯片进行预测部署。目前支持在昇腾芯片(910/310)和arm服务器上进行部署,后续完善对其他异构硬件服务器部署能力。 + +## 昇腾910 + +### 安装Docker镜像 +我们推荐使用docker部署Serving服务,可以直接从Paddle的官方镜像库拉取预先装有 CANN 社区版 5.0.2.alpha005 的 docker 镜像。 +``` +# 拉取镜像 +docker pull paddlepaddle/paddle:latest-dev-cann5.0.2.alpha005-gcc82-aarch64 + 
+# 启动容器,注意这里的参数 --device,容器仅映射设备ID为4到7的4张NPU卡,如需映射其他卡相应增改设备ID号即可 +docker run -it --name paddle-npu-dev -v /home/:/workspace \ + --pids-limit 409600 --network=host --shm-size=128G \ + --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ + --device=/dev/davinci4 --device=/dev/davinci5 \ + --device=/dev/davinci6 --device=/dev/davinci7 \ + --device=/dev/davinci_manager \ + --device=/dev/devmm_svm \ + --device=/dev/hisi_hdc \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/dcmi:/usr/local/dcmi \ + paddlepaddle/paddle:latest-dev-cann5.0.2.alpha005-gcc82-aarch64 /bin/bash + +# 检查容器中是否可以正确识别映射的昇腾DCU设备 +npu-smi info + +# 预期得到类似如下的结果 ++------------------------------------------------------------------------------------+ +| npu-smi 1.9.3 Version: 21.0.rc1 | ++----------------------+---------------+---------------------------------------------+ +| NPU Name | Health | Power(W) Temp(C) | +| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) | ++======================+===============+=============================================+ +| 4 910A | OK | 67.2 30 | +| 0 | 0000:C2:00.0 | 0 303 / 15171 0 / 32768 | ++======================+===============+=============================================+ +| 5 910A | OK | 63.8 25 | +| 0 | 0000:82:00.0 | 0 2123 / 15171 0 / 32768 | ++======================+===============+=============================================+ +| 6 910A | OK | 67.1 27 | +| 0 | 0000:42:00.0 | 0 1061 / 15171 0 / 32768 | ++======================+===============+=============================================+ +| 7 910A | OK | 65.5 30 | +| 0 | 0000:02:00.0 | 0 2563 / 15078 0 / 32768 | ++======================+===============+=============================================+ +``` + +### 编译、安装 +基本环境配置可参考[该文档](Compile_CN.md)进行配置。 + +***1、依赖安装*** + +安装编译所需依赖库,包括patchelf、libcurl等 +``` +apt-get install patchelf libcurl4-openssl-dev libbz2-dev libgeos-dev +``` + +***2、GOLANG环境配置*** + +下载并配置ARM版本的GOLANG-1.17.2 +``` +wget https://golang.org/dl/go1.17.2.linux-arm64.tar.gz +tar zxvf go1.17.2.linux-arm64.tar.gz -C /usr/local/ +mkdir /root/go /root/go/bin /root/go/src +echo "GOROOT=/usr/local/go" >> /root/.bashrc +echo "GOPATH=/root/go" >> /root/.bashrc +echo "PATH=/usr/local/go/bin:/root/go/bin:$PATH" >> /root/.bashrc +source /root/.bashrc + +go env -w GO111MODULE=on +go env -w GOPROXY=https://goproxy.cn,direct +go install github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2 +go install github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2 +go install github.com/golang/protobuf/protoc-gen-go@v1.4.3 +go install google.golang.org/grpc@v1.33.0 +go env -w GO111MODULE=auto +``` + +***3、PYTHON环境配置*** + +下载python依赖库并配置环境 +``` +pip3.7 install -r python/requirements.txt -i https://mirror.baidu.com/pypi/simple +export PYTHONROOT=/opt/conda +export PYTHON_INCLUDE_DIR=$PYTHONROOT/include/python3.7m +export PYTHON_LIBRARIES=$PYTHONROOT/lib/libpython3.7m.so +export PYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.7 +``` + +***4、编译server*** + +``` +mkdir build-server-npu && cd build-server-npu +cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR/ \ + -DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \ + -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ + -DCMAKE_INSTALL_PREFIX=./output \ + -DWITH_ASCEND_CL=ON \ + -DSERVER=ON .. 
+make TARGET=ARMV8 -j16 +``` + +***5、安装编译包*** + +编译步骤完成后,会在各自编译目录$build_dir/python/dist生成whl包,分别安装即可。例如server步骤,会在server-build-npu/python/dist目录下生成whl包, 使用命令```pip install -u xxx.whl```进行安装。 + +### 部署使用 +为了支持arm+昇腾910服务部署,启动服务时需使用以下参数。 +| 参数 | 参数说明 | 备注 | +| :------- | :-------------------------- | :--------------------------------------------------------------- | +| use_ascend_cl | 使用Ascend CL进行预测 | 使用Ascend预测能力 | + +以[Bert](../examples/C++/PaddleNLP/bert/README_CN.md)为例 + +启动rpc服务,使用Ascend npu优化加速能力 +``` +python3 -m paddle_serving_server.serve --model bert_seq128_model --thread 6 --port 9292 --use_ascend_cl +``` + +## 昇腾310 + +### 安装Docker镜像 +我们推荐使用docker部署Serving服务,可以拉取装有 CANN 3.3.0 docker 镜像。 +``` +# 拉取镜像 +docker pull registry.baidubce.com/paddlepaddle/serving:ascend-aarch64-cann3.3.0-paddlelite-devel + +# 启动容器,注意这里的参数 --device,容器仅映射设备ID为4到7的4张NPU卡,如需映射其他卡相应增改设备ID号即可 +docker run -it --name paddle-npu-dev -v /home/:/workspace \ + --pids-limit 409600 --network=host --shm-size=128G \ + --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ + --device=/dev/davinci4 --device=/dev/davinci5 \ + --device=/dev/davinci6 --device=/dev/davinci7 \ + --device=/dev/davinci_manager \ + --device=/dev/devmm_svm \ + --device=/dev/hisi_hdc \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/dcmi:/usr/local/dcmi \ + registry.baidubce.com/paddlepaddle/serving:ascend-aarch64-cann3.3.0-paddlelite-devel /bin/bash + +``` + +### 编译、安装 +基本环境配置可参考[该文档](Compile_CN.md)进行配置。 + +***1、PYTHON环境配置*** + +下载python依赖库并配置环境 +``` +pip3.7 install -r python/requirements.txt -i https://mirror.baidu.com/pypi/simple +export PYTHONROOT=/usr/local/python3.7.5 +export PYTHON_INCLUDE_DIR=$PYTHONROOT/include/python3.7m +export PYTHON_LIBRARIES=$PYTHONROOT/lib/libpython3.7m.so +export PYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.7 +``` + +***2、编译server*** + +``` +mkdir build-server-npu && cd build-server-npu +cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR/ \ + -DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \ + -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ + -DCMAKE_INSTALL_PREFIX=./output \ + -DWITH_ASCEND_CL=ON \ + -DWITH_LITE=ON \ + -DSERVER=ON .. 
+make TARGET=ARMV8 -j16 +``` + +***3、安装编译包*** + +编译步骤完成后,会在各自编译目录$build_dir/python/dist生成whl包,分别安装即可。例如server步骤,会在server-build-npu/python/dist目录下生成whl包, 使用命令```pip install -u xxx.whl```进行安装。 + +### 部署使用 +为了支持arm+昇腾310服务部署,启动服务时需使用以下参数。 +| 参数 | 参数说明 | 备注 | +| :------- | :-------------------------- | :--------------------------------------------------------------- | +| use_ascend_cl | 使用Ascend CL进行预测 | 使用Ascend预测能力 | +| use_lite | 使用Paddle-Lite Engine | 使用Paddle-Lite cpu预测能力 | + +以[resnet50](../examples/C++/PaddleClas/resnet_v2_50/README_CN.md)为例 + +启动rpc服务,使用Paddle-Lite npu优化加速能力 +``` +python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --thread 6 --port 9292 --use_ascend_cl --use_lite +``` + +## 其他说明 + +### NPU芯片支持相关参考资料 +* [昇腾NPU芯片运行飞桨](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/09_hardware_support/npu_docs/paddle_install_cn.html) \ No newline at end of file diff --git a/doc/Serving_Configure_CN.md b/doc/Serving_Configure_CN.md index c84ef9d78ce4960a72e1802869450122a75f95be..5b42221a894de54c4c46e23c254f62d464c9bc4f 100644 --- a/doc/Serving_Configure_CN.md +++ b/doc/Serving_Configure_CN.md @@ -364,11 +364,41 @@ dag: tracer: interval_s: 10 + #client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测 + #client_type: local_predictor + + #channel的最大长度,默认为0 + #channel_size: 0 + + #针对大模型分布式场景tensor并行,接收第一个返回结果后其他结果丢弃来提供速度 + #channel_recv_frist_arrive: False + op: det: #并发数,is_thread_op=True时,为线程并发;否则为进程并发 concurrency: 6 + #Serving IPs + #server_endpoints: ["127.0.0.1:9393"] + + #Fetch结果列表,以client_config中fetch_var的alias_name为准 + #fetch_list: ["concat_1.tmp_0"] + + #det模型client端配置 + #client_config: serving_client_conf.prototxt + + #Serving交互超时时间, 单位ms + #timeout: 3000 + + #Serving交互重试次数,默认不重试 + #retry: 1 + + # 批量查询Serving的数量, 默认1。batch_size>1要设置auto_batching_timeout,否则不足batch_size时会阻塞 + #batch_size: 2 + + # 批量查询超时,与batch_size配合使用 + #auto_batching_timeout: 2000 + #当op配置没有server_endpoints时,从local_service_conf读取本地服务配置 local_service_conf: #client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测 @@ -392,10 +422,34 @@ op: #ir_optim, 开启TensorRT时,必须同时设置ir_optim=True,否则无效 ir_optim: True + #CPU 计算线程数,在CPU场景开启会降低单次请求响应时长 + #thread_num: 10 + #precsion, 预测精度,降低预测精度可提升预测速度 #GPU 支持: "fp32"(default), "fp16", "int8"; #CPU 支持: "fp32"(default), "fp16", "bf16"(mkldnn); 不支持: "int8" precision: "fp32" + + #mem_optim, memory / graphic memory optimization + #mem_optim: True + + #use_calib, Use TRT int8 calibration + #use_calib: False + + #use_mkldnn, Use mkldnn for cpu + #use_mkldnn: False + + #The cache capacity of different input shapes for mkldnn + #mkldnn_cache_capacity: 0 + + #mkldnn_op_list, op list accelerated using MKLDNN, None default + #mkldnn_op_list: [] + + #mkldnn_bf16_op_list,op list accelerated using MKLDNN bf16, None default. 
+ #mkldnn_bf16_op_list: [] + + #min_subgraph_size,the minimal subgraph size for opening tensorrt to optimize, 3 default + #min_subgraph_size: 3 rec: #并发数,is_thread_op=True时,为线程并发;否则为进程并发 concurrency: 3 @@ -430,6 +484,9 @@ op: #ir_optim, 开启TensorRT时,必须同时设置ir_optim=True,否则无效 ir_optim: True + #CPU 计算线程数,在CPU场景开启会降低单次请求响应时长 + #thread_num: 10 + #precsion, 预测精度,降低预测精度可提升预测速度 #GPU 支持: "fp32"(default), "fp16", "int8"; #CPU 支持: "fp32"(default), "fp16", "bf16"(mkldnn); 不支持: "int8" diff --git a/doc/Serving_Configure_EN.md b/doc/Serving_Configure_EN.md index ddf81ea7652356fd3b7a65af4f1ba1a4c55f6db0..1a3e4bd180c7d2e5c39aca550960c6fb8c551e6e 100644 --- a/doc/Serving_Configure_EN.md +++ b/doc/Serving_Configure_EN.md @@ -369,11 +369,41 @@ dag: tracer: interval_s: 10 + #client type,include brpc, grpc and local_predictor. + #client_type: local_predictor + + # max channel size, default 0 + #channel_size: 0 + + #For distributed large model scenario with tensor parallelism, the first result is received and the other results are discarded to provide speed + #channel_recv_frist_arrive: False + op: det: #concurrency,is_thread_op=True,thread otherwise process concurrency: 6 + #Serving IPs + #server_endpoints: ["127.0.0.1:9393"] + + #Fetch data list + #fetch_list: ["concat_1.tmp_0"] + + #det client config + #client_config: serving_client_conf.prototxt + + #Serving timeout, ms + #timeout: 3000 + + #Serving retry times + #retry: 1 + + #Default 1。batch_size>1 should set auto_batching_timeout + #batch_size: 2 + + #Batching timeout,used with batch_size + #auto_batching_timeout: 2000 + #Loading local server configuration without server_endpoints. local_service_conf: #client type,include brpc, grpc and local_predictor. @@ -397,10 +427,34 @@ op: #ir_optim, When running on TensorRT,must set ir_optim=True ir_optim: True + #CPU 计算线程数,在CPU场景开启会降低单次请求响应时长 + #thread_num: 10 + #precsion, Decrease accuracy can increase speed #GPU 支持: "fp32"(default), "fp16", "int8"; #CPU 支持: "fp32"(default), "fp16", "bf16"(mkldnn); 不支持: "int8" precision: "fp32" + + #mem_optim, memory / graphic memory optimization + #mem_optim: True + + #use_calib, Use TRT int8 calibration + #use_calib: False + + #use_mkldnn, Use mkldnn for cpu + #use_mkldnn: False + + #The cache capacity of different input shapes for mkldnn + #mkldnn_cache_capacity: 0 + + #mkldnn_op_list, op list accelerated using MKLDNN, None default + #mkldnn_op_list: [] + + #mkldnn_bf16_op_list,op list accelerated using MKLDNN bf16, None default. 
+ #mkldnn_bf16_op_list: [] + + #min_subgraph_size,the minimal subgraph size for opening tensorrt to optimize, 3 default + #min_subgraph_size: 3 rec: #concurrency,is_thread_op=True,thread otherwise process concurrency: 3 @@ -435,6 +489,9 @@ op: #ir_optim, When running on TensorRT,must set ir_optim=True ir_optim: True + #CPU 计算线程数,在CPU场景开启会降低单次请求响应时长 + #thread_num: 10 + #precsion, Decrease accuracy can increase speed #GPU 支持: "fp32"(default), "fp16", "int8"; #CPU 支持: "fp32"(default), "fp16", "bf16"(mkldnn); 不支持: "int8" diff --git a/doc/images/wechat_group_1.jpeg b/doc/images/wechat_group_1.jpeg index cf7428a9ae3e0892ab696dbbe86ebf45c5b204d0..daa619adaea3c86e7e43180ef92551cb992332af 100644 Binary files a/doc/images/wechat_group_1.jpeg and b/doc/images/wechat_group_1.jpeg differ diff --git a/examples/C++/PaddleNLP/bert/README.md b/examples/C++/PaddleNLP/bert/README.md index 5d3242837f6d8be08f321d68890587e4bba725e8..64f6441f28776114346f99fc16b363aa18276f4c 100755 --- a/examples/C++/PaddleNLP/bert/README.md +++ b/examples/C++/PaddleNLP/bert/README.md @@ -1,4 +1,4 @@ -Http## Bert as service +## Bert as service ([简体中文](./README_CN.md)|English) diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py index 0bdfe7490d52433f5bd5f68c4454822c17675580..f9a50ea37d4cc75b92cf2d62954c3c0de88e8094 100755 --- a/python/paddle_serving_server/serve.py +++ b/python/paddle_serving_server/serve.py @@ -37,6 +37,11 @@ from paddle_serving_server.util import * from paddle_serving_server.env_check.run import check_env import cmd +def signal_handler(signal, frame): + print('Process stopped') + sys.exit(0) + +signal.signal(signal.SIGINT, signal_handler) # web_service.py is still used by Pipeline. def port_is_available(port): diff --git a/python/pipeline/dag.py b/python/pipeline/dag.py index 1275bc704884f6d497da446d03914e24f62fd8ea..d7020ae7e937380fa03e4dac3b68559a2fbca745 100644 --- a/python/pipeline/dag.py +++ b/python/pipeline/dag.py @@ -62,6 +62,13 @@ class DAGExecutor(object): self._retry = dag_conf["retry"] self._server_use_profile = dag_conf["use_profile"] + self._enable_prometheus = False + if "enable_prometheus" in dag_conf: + self._enable_prometheus = dag_conf["enable_prometheus"] + if "prometheus_port" in dag_conf and self._enable_prometheus: + self._prometheus_port = dag_conf["prometheus_port"] + else: + self._prometheus_port = None channel_size = dag_conf["channel_size"] channel_recv_frist_arrive = dag_conf["channel_recv_frist_arrive"] self._is_thread_op = dag_conf["is_thread_op"] @@ -77,8 +84,10 @@ class DAGExecutor(object): if tracer_interval_s >= 1: self._tracer = PerformanceTracer( self._is_thread_op, tracer_interval_s, server_worker_num) + if self._enable_prometheus: + self._tracer.set_enable_dict(True) - self._dag = DAG(self.name, response_op, self._server_use_profile, + self._dag = DAG(self.name, response_op, self._server_use_profile, self._prometheus_port, self._is_thread_op, channel_size, build_dag_each_worker, self._tracer, channel_recv_frist_arrive) (in_channel, out_channel, pack_rpc_func, @@ -480,10 +489,10 @@ class DAG(object): """ Directed Acyclic Graph(DAG) engine, builds one DAG topology. 
""" - def __init__(self, request_name, response_op, use_profile, is_thread_op, + def __init__(self, request_name, response_op, use_profile, prometheus_port, is_thread_op, channel_size, build_dag_each_worker, tracer, channel_recv_frist_arrive): - _LOGGER.info("{}, {}, {}, {}, {} ,{} ,{} ,{}".format(request_name, response_op, use_profile, is_thread_op, + _LOGGER.info("{}, {}, {}, {}, {}, {} ,{} ,{} ,{}".format(request_name, response_op, use_profile, prometheus_port, is_thread_op, channel_size, build_dag_each_worker, tracer, channel_recv_frist_arrive)) @ErrorCatch @@ -491,6 +500,7 @@ class DAG(object): def init_helper(self, request_name: str, response_op, use_profile: [bool, None], + prometheus_port: [int, None], is_thread_op: bool, channel_size, build_dag_each_worker: [bool, None], @@ -499,6 +509,8 @@ class DAG(object): self._request_name = request_name self._response_op = response_op self._use_profile = use_profile + self._prometheus_port = prometheus_port + self._use_prometheus = (self._prometheus_port is not None) self._is_thread_op = is_thread_op self._channel_size = channel_size self._build_dag_each_worker = build_dag_each_worker @@ -506,7 +518,7 @@ class DAG(object): self._channel_recv_frist_arrive = channel_recv_frist_arrive if not self._is_thread_op: self._manager = PipelineProcSyncManager() - init_helper(self, request_name, response_op, use_profile, is_thread_op, + init_helper(self, request_name, response_op, use_profile, prometheus_port, is_thread_op, channel_size, build_dag_each_worker, tracer, channel_recv_frist_arrive) print("[DAG] Succ init") @@ -828,6 +840,56 @@ class DAG(object): return self._input_channel, self._output_channel, self._pack_func, self._unpack_func + def start_prom(self, prometheus_port): + import prometheus_client + from prometheus_client import Counter + from prometheus_client.core import CollectorRegistry + + from flask import Response, Flask + from .prometheus_metrics import registry + from .prometheus_metrics import metric_query_success, metric_query_failure, metric_inf_count, metric_query_duration_us, metric_inf_duration_us + app = Flask(__name__) + # requests_total = Counter('c1','A counter') + + @app.route("/metrics") + def requests_count(): + item = self._tracer.profile_dict + _LOGGER.info("metrics: {}".format(item)) + # {'uci': {'in': 727.443, 'prep': 0.5525833333333333, 'midp': 2.21375, 'postp': 1.32375, 'out': 0.9396666666666667}, 'DAG': {'call_0': 29.479, 'call_1': 8.176, 'call_2': 8.045, 'call_3': 7.988, 'call_4': 7.609, 'call_5': 7.629, 'call_6': 7.625, 'call_7': 8.32, 'call_8': 8.57, 'call_9': 8.055, 'call_10': 7.915, 'call_11': 7.873, 'query_count': 12, 'qps': 1.2, 'succ': 1.0, 'avg': 9.773666666666667, '50': 8.045, '60': 8.055, '70': 8.176, '80': 8.32, '90': 8.57, '95': 29.479, '99': 29.479}} + if "DAG" in item: + total = item["DAG"]["query_count"] + succ = total * item["DAG"]["succ"] + fail = total * (1 - item["DAG"]["succ"]) + query_duration = total *item["DAG"]["avg"] + metric_query_success.inc(succ) + metric_query_failure._value.inc(fail) + metric_query_duration_us._value.inc(query_duration) + + inf_cnt = 0 + infer_duration = 0.0 + for name in item: + if name != "DAG": + if "count" in item[name]: + inf_cnt += item[name]["count"] + if "midp" in item[name]: + infer_duration += item[name]["count"]*item[name]["midp"] + metric_inf_count._value.inc(inf_cnt) + metric_inf_duration_us._value.inc(infer_duration) + + #return str(item) + self._tracer.profile_dict = {} + return Response(prometheus_client.generate_latest(registry),mimetype="text/plain") 
+ + def prom_run(): + app.run(host="0.0.0.0",port=prometheus_port) + + p = threading.Thread( + target=prom_run, + args=()) + _LOGGER.info("Prometheus Start 2") + p.daemon = True + p.start() + def start(self): """ Each OP starts a thread or process by _is_thread_op @@ -842,12 +904,16 @@ class DAG(object): for op in self._actual_ops: op.use_profiler(self._use_profile) op.set_tracer(self._tracer) + op.set_use_prometheus(self._use_prometheus) if self._is_thread_op: self._threads_or_proces.extend(op.start_with_thread()) else: self._threads_or_proces.extend(op.start_with_process()) _LOGGER.info("[DAG] start") - + if self._use_prometheus: + _LOGGER.info("Prometheus Start 1") + self.start_prom(self._prometheus_port) + # not join yet return self._threads_or_proces diff --git a/python/pipeline/operator.py b/python/pipeline/operator.py index 9407717a49f54efc982b098ca9dc9008f9713c53..5c8acc1e9fdf0b468eb3822d467c680757f46b5a 100644 --- a/python/pipeline/operator.py +++ b/python/pipeline/operator.py @@ -371,6 +371,9 @@ class Op(object): def set_tracer(self, tracer): self._tracer = tracer + def set_use_prometheus(self, use_prometheus): + self._use_prometheus = use_prometheus + def init_client(self, client_config, server_endpoints): """ Initialize the client object. There are three types of clients, brpc, @@ -1448,6 +1451,7 @@ class Op(object): midped_data_dict, err_channeldata_dict \ = self._run_process(preped_data_dict, op_info_prefix, skip_process_dict, logid_dict) end = profiler.record("midp#{}_1".format(op_info_prefix)) + _LOGGER.info("prometheus inf count +1") midp_time = end - start _LOGGER.debug("op:{} process_end:{}, cost:{}".format( op_info_prefix, time.time(), midp_time)) diff --git a/python/pipeline/profiler.py b/python/pipeline/profiler.py index 6e38cb313d20962883cda192504e146ee31b1c95..2318e4e8fc8f5820197ce752263ed6f4fe45d1e8 100644 --- a/python/pipeline/profiler.py +++ b/python/pipeline/profiler.py @@ -49,13 +49,18 @@ class PerformanceTracer(object): self._channels = [] # The size of data in Channel will not exceed server_worker_num self._server_worker_num = server_worker_num - if _is_profile: - self.profile_dict = {} + self.profile_dict = {} + self._enable_dict = False def data_buffer(self): return self._data_buffer def start(self): + self._thrd = threading.Thread( + target=self._trace_func, args=(self._channels, )) + self._thrd.daemon = True + self._thrd.start() + """ if self._is_thread_mode: self._thrd = threading.Thread( target=self._trace_func, args=(self._channels, )) @@ -66,10 +71,14 @@ class PerformanceTracer(object): target=self._trace_func, args=(self._channels, )) self._proc.daemon = True self._proc.start() + """ def set_channels(self, channels): self._channels = channels + def set_enable_dict(self, enable): + self._enable_dict = enable + def _trace_func(self, channels): all_actions = ["in", "prep", "midp", "postp", "out"] calcu_actions = ["prep", "midp", "postp"] @@ -106,9 +115,14 @@ class PerformanceTracer(object): if len(op_cost) != 0: for name in op_cost: tot_cost, calcu_cost = 0.0, 0.0 + count = 0 for action, costs in op_cost[name].items(): op_cost[name][action] = sum(costs) / (1e3 * len(costs)) tot_cost += op_cost[name][action] + if action == "midp": + count = len(costs) + if "midp" in op_cost[name].keys(): + op_cost[name]['count'] = count if name != "DAG": _LOGGER.info("Op({}):".format(name)) @@ -121,8 +135,7 @@ class PerformanceTracer(object): calcu_cost += op_cost[name][action] _LOGGER.info("\tidle[{}]".format(1 - 1.0 * calcu_cost / tot_cost)) - if _is_profile: - 
self.profile_dict = copy.deepcopy(op_cost) + self.profile_dict = copy.deepcopy(op_cost) if "DAG" in op_cost: calls = list(op_cost["DAG"].values()) @@ -142,7 +155,7 @@ class PerformanceTracer(object): for latency in latencys: _LOGGER.info("\t\t.{}[{} ms]".format(latency, calls[int( tot * latency / 100.0)])) - if _is_profile: + if _is_profile or self._enable_dict: self.profile_dict["DAG"]["query_count"] = tot self.profile_dict["DAG"]["qps"] = qps self.profile_dict["DAG"]["succ"] = 1 - 1.0 * err_count / tot diff --git a/python/pipeline/prometheus_metrics.py b/python/pipeline/prometheus_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..728153749600bb7ad8b3e97e5c4569a661f39454 --- /dev/null +++ b/python/pipeline/prometheus_metrics.py @@ -0,0 +1,8 @@ +from prometheus_client import Counter, generate_latest, CollectorRegistry, Gauge + +registry = CollectorRegistry() +metric_query_success = Counter("pd_query_request_success_total", "metric_query_success", registry=registry) +metric_query_failure = Counter("pd_query_request_failure_total", "metric_query_failure", registry=registry) +metric_inf_count = Counter("pd_inference_count_total", "metric_inf_count", registry=registry) +metric_query_duration_us = Counter("pd_query_request_duration_us_total", "metric_query_duration_us", registry=registry) +metric_inf_duration_us = Counter("pd_inference_duration_us_total", "metric_inf_duration_us", registry=registry) diff --git a/python/requirements.txt b/python/requirements.txt index ddb2e1360527f0b8adb88ff3d347df31ec47af28..094d2f98213a747783461e5c6050e525d3bfdb7f 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,5 +1,5 @@ numpy>=1.12, <=1.16.4 ; python_version<"3.5" -shapely==1.7.0 +shapely==1.8.0 wheel>=0.34.0, <0.35.0 setuptools>=44.1.0 google>=2.0.3 @@ -16,8 +16,9 @@ pyclipper==1.2.1 MarkupSafe==1.1.1 Werkzeug==1.0.1 ujson>=2.0.3 -sentencepiece==0.1.92; platform_machine != "aarch64" +sentencepiece==0.1.96; platform_machine != "aarch64" sentencepiece; platform_machine == "aarch64" -opencv-python==4.2.0.32; platform_machine != "aarch64" +opencv-python==4.3.0.38; platform_machine != "aarch64" opencv-python; platform_machine == "aarch64" pytest +prometheus-client==0.12.0 \ No newline at end of file diff --git a/python/setup.py.app.in b/python/setup.py.app.in index 0d4763dfd448f12d23d26671a568f625d42ab7e7..79ddca2744bc014efe98f6040365079405c5fd07 100644 --- a/python/setup.py.app.in +++ b/python/setup.py.app.in @@ -44,9 +44,9 @@ REQUIRED_PACKAGES = [ 'six >= 1.10.0', 'pillow', 'pyclipper', 'shapely', - 'sentencepiece<=0.1.92; platform_machine != "aarch64"', + 'sentencepiece<=0.1.96; platform_machine != "aarch64"', 'sentencepiece; platform_machine == "aarch64"', - 'opencv-python<=4.2.0.32; platform_machine != "aarch64"', + 'opencv-python<=4.3.0.38; platform_machine != "aarch64"', 'opencv-python; platform_machine == "aarch64"', ]