PaddlePaddle/PaddleSlim — Commit dfcc3c2b (unverified)

Unify YOLO series act demo (#1344)

Authored Aug 16, 2022 by Guanghua Yu; committed via GitHub on Aug 16, 2022
Parent: 407d47b3

Showing 45 changed files with 102 additions and 4085 deletions (+102 −4085)
Changed files:

- README.md (+1 −1)
- example/auto_compression/README.md (+4 −4)
- example/auto_compression/pytorch_yolo_series/README.md (+27 −9)
- example/auto_compression/pytorch_yolo_series/configs/yolov5s_qat_dis.yaml (+1 −0)
- example/auto_compression/pytorch_yolo_series/configs/yolov6s_qat_dis.yaml (+1 −0)
- example/auto_compression/pytorch_yolo_series/configs/yolov7_qat_dis.yaml (+1 −0)
- example/auto_compression/pytorch_yolo_series/configs/yolov7_tiny_qat_dis.yaml (+1 −0)
- example/auto_compression/pytorch_yolo_series/cpp_infer/CMakeLists.txt (+0 −0)
- example/auto_compression/pytorch_yolo_series/cpp_infer/README.md (+30 −7)
- example/auto_compression/pytorch_yolo_series/cpp_infer/compile.sh (+0 −0)
- example/auto_compression/pytorch_yolo_series/cpp_infer/trt_run.cc (+7 −2)
- example/auto_compression/pytorch_yolo_series/dataset.py (+4 −2)
- example/auto_compression/pytorch_yolo_series/eval.py (+4 −5)
- example/auto_compression/pytorch_yolo_series/images/000000570688.jpg (+0 −0)
- example/auto_compression/pytorch_yolo_series/onnx_trt_infer.py (+0 −0)
- example/auto_compression/pytorch_yolo_series/paddle_trt_infer.py (+10 −3)
- example/auto_compression/pytorch_yolo_series/post_process.py (+2 −2)
- example/auto_compression/pytorch_yolo_series/run.py (+9 −8)
- example/auto_compression/pytorch_yolov5/README.md (+0 −152)
- example/auto_compression/pytorch_yolov5/cpp_infer/trt_run.cc (+0 −116)
- example/auto_compression/pytorch_yolov5/dataset.py (+0 −113)
- example/auto_compression/pytorch_yolov5/eval.py (+0 −100)
- example/auto_compression/pytorch_yolov5/paddle_trt_infer.py (+0 −319)
- example/auto_compression/pytorch_yolov5/post_process.py (+0 −231)
- example/auto_compression/pytorch_yolov5/post_quant.py (+0 −91)
- example/auto_compression/pytorch_yolov5/run.py (+0 −125)
- example/auto_compression/pytorch_yolov5/yolov5_onnx_trt.py (+0 −378)
- example/auto_compression/pytorch_yolov6/README.md (+0 −152)
- example/auto_compression/pytorch_yolov6/cpp_infer/CMakeLists.txt (+0 −263)
- example/auto_compression/pytorch_yolov6/cpp_infer/README.md (+0 −50)
- example/auto_compression/pytorch_yolov6/cpp_infer/compile.sh (+0 −37)
- example/auto_compression/pytorch_yolov6/dataset.py (+0 −113)
- example/auto_compression/pytorch_yolov6/eval.py (+0 −100)
- example/auto_compression/pytorch_yolov6/images/000000570688.jpg (+0 −0)
- example/auto_compression/pytorch_yolov6/paddle_trt_infer.py (+0 −319)
- example/auto_compression/pytorch_yolov6/post_process.py (+0 −231)
- example/auto_compression/pytorch_yolov6/post_quant.py (+0 −91)
- example/auto_compression/pytorch_yolov6/run.py (+0 −125)
- example/auto_compression/pytorch_yolov6/yolov6_onnx_trt.py (+0 −378)
- example/auto_compression/pytorch_yolov7/cpp_infer/CMakeLists.txt (+0 −263)
- example/auto_compression/pytorch_yolov7/cpp_infer/README.md (+0 −51)
- example/auto_compression/pytorch_yolov7/cpp_infer/compile.sh (+0 −37)
- example/auto_compression/pytorch_yolov7/cpp_infer/trt_run.cc (+0 −116)
- example/auto_compression/pytorch_yolov7/images/000000570688.jpg (+0 −0)
- example/auto_compression/pytorch_yolov7/post_quant.py (+0 −91)
README.md — view file @ dfcc3c2b

````diff
@@ -20,7 +20,7 @@ PaddleSlim is a toolkit focused on deep learning model compression, providing low-bit quantization, ...
 - Supports compression without code changes: users only need to provide the inference model files and data to run post-training quantization (PTQ), quantization-aware training (QAT), sparse training, and other compression tasks.
 - Supports automatic strategy selection based on task traits and deployment environment: automatically searches for a suitable PTQ method and for the best combination of compression strategies.
 - Released auto-compression examples for [NLP](example/auto_compression/nlp), [semantic segmentation](example/auto_compression/semantic_segmentation), and [object detection](example/auto_compression/detection).
-- Released `X2Paddle` model auto-compression solutions: [YOLOv5](example/auto_compression/pytorch_yolov5), [YOLOv6](example/auto_compression/pytorch_yolov6), [YOLOv7](example/auto_compression/pytorch_yolov7), [HuggingFace](example/auto_compression/pytorch_huggingface), [MobileNet](example/auto_compression/tensorflow_mobilenet).
+- Released `X2Paddle` model auto-compression solutions: [YOLOv5](example/auto_compression/pytorch_yolo_series), [YOLOv6](example/auto_compression/pytorch_yolo_series), [YOLOv7](example/auto_compression/pytorch_yolo_series), [HuggingFace](example/auto_compression/pytorch_huggingface), [MobileNet](example/auto_compression/tensorflow_mobilenet).
 - Upgraded quantization features
````
example/auto_compression/README.md — view file @ dfcc3c2b

````diff
@@ -82,15 +82,15 @@ Compared with traditional model compression methods, ACT ...
 | [Semantic Segmentation](./semantic_segmentation) | UNet | 65.00 | 64.93 | 15.29 | 10.23 | **1.49** | NVIDIA Tesla T4 |
 | NLP | PP-MiniLM | 72.81 | 72.44 | 128.01 | 17.97 | **7.12** | NVIDIA Tesla T4 |
 | NLP | ERNIE 3.0-Medium | 73.09 | 72.40 | 29.25(fp16) | 19.61 | **1.49** | NVIDIA Tesla T4 |
-| [Object Detection](./pytorch_yolov5) | YOLOv5s<br/>(PyTorch) | 37.40 | 36.9 | 5.95 | 1.87 | **3.18** | NVIDIA Tesla T4 |
-| [Object Detection](./pytorch_yolov6) | YOLOv6s<br/>(PyTorch) | 42.4 | 41.3 | 9.06 | 1.83 | **4.95** | NVIDIA Tesla T4 |
-| [Object Detection](./pytorch_yolov7) | YOLOv7<br/>(PyTorch) | 51.1 | 50.8 | 26.84 | 4.55 | **5.89** | NVIDIA Tesla T4 |
+| [Object Detection](./pytorch_yolo_series) | YOLOv5s<br/>(PyTorch) | 37.40 | 36.9 | 5.95 | 1.87 | **3.18** | NVIDIA Tesla T4 |
+| [Object Detection](./pytorch_yolo_series) | YOLOv6s<br/>(PyTorch) | 42.4 | 41.3 | 9.06 | 1.83 | **4.95** | NVIDIA Tesla T4 |
+| [Object Detection](./pytorch_yolo_series) | YOLOv7<br/>(PyTorch) | 51.1 | 50.8 | 26.84 | 4.55 | **5.89** | NVIDIA Tesla T4 |
 | [Object Detection](./detection) | PP-YOLOE-s | 43.1 | 42.6 | 6.51 | 2.12 | **3.07** | NVIDIA Tesla T4 |
 | [Image Classification](./image_classification) | MobileNetV1<br/>(TensorFlow) | 71.0 | 70.22 | 30.45 | 15.86 | **1.92** | SDMM865 (Snapdragon 865) |
 
 - Note: the object detection accuracy metric is mAP (0.5:0.95); the image segmentation accuracy metric is IoU.
 - More Paddle model examples and benchmarks: [Image Classification](./image_classification), [Object Detection](./detection), [Semantic Segmentation](./semantic_segmentation), [NLP](./nlp)
-- More examples and benchmarks for other frameworks: [YOLOv5 (PyTorch)](./pytorch_yolov5), [YOLOv6 (PyTorch)](./pytorch_yolov6), [YOLOv7 (PyTorch)](./pytorch_yolov7), [HuggingFace (PyTorch)](./pytorch_huggingface), [MobileNet (TensorFlow)](./tensorflow_mobilenet).
+- More examples and benchmarks for other frameworks: [YOLOv5 (PyTorch)](./pytorch_yolo_series), [YOLOv6 (PyTorch)](./pytorch_yolo_series), [YOLOv7 (PyTorch)](./pytorch_yolo_series), [HuggingFace (PyTorch)](./pytorch_huggingface), [MobileNet (TensorFlow)](./tensorflow_mobilenet).
 
 ## **Environment Setup**
````
example/auto_compression/pytorch_yolov7/README.md → example/auto_compression/pytorch_yolo_series/README.md — view file @ dfcc3c2b

````diff
-# YOLOv7 Auto Compression Example
+# YOLO Series Auto Compression Example
 
 Contents:
 
 - [1. Introduction](#1简介)
````
````diff
@@ -14,14 +14,22 @@
 ## 1. Introduction
 
-This example takes the [WongKinYiu/yolov7](https://github.com/WongKinYiu/yolov7) object detection model as an example: with the help of [X2Paddle](https://github.com/PaddlePaddle/X2Paddle), the PyTorch model is converted into a Paddle model, which is then compressed with the ACT auto-compression feature. The compressed model can be deployed with Paddle Inference, or exported to ONNX and deployed with TensorRT.
+This example takes the [ultralytics/yolov5](https://github.com/ultralytics/yolov5), [meituan/YOLOv6](https://github.com/meituan/YOLOv6) and [WongKinYiu/yolov7](https://github.com/WongKinYiu/yolov7) object detection models as examples: with the help of [X2Paddle](https://github.com/PaddlePaddle/X2Paddle), the PyTorch models are converted into Paddle models, which are then compressed with the ACT auto-compression feature. The compressed models can be deployed with Paddle Inference, or exported to ONNX and deployed with TensorRT.
````
````diff
 ## 2. Benchmark
 
 | Model | Strategy | Input size | mAP<sup>val<br>0.5:0.95 | Model size | Latency<sup><small>FP32</small><sup><br><sup> | Latency<sup><small>FP16</small><sup><br><sup> | Latency<sup><small>INT8</small><sup><br><sup> | Config | Inference model |
 | :-------- |:-------- |:--------: | :--------: | :---------------------: | :----------------: | :----------------: | :---------------: | :-----------------------------: | :-----------------------------: |
 | YOLOv5s | Base model | 640*640 | 37.4 | 28.1MB | 5.95ms | 2.44ms | - | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov5s.onnx) |
 | YOLOv5s | PTQ | 640*640 | 36.0 | 7.4MB | - | - | 1.87ms | [config](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/post_training_quantization/pytorch_yolo_series) | - |
 | YOLOv5s | ACT QAT | 640*640 | **36.9** | 7.4MB | - | - | **1.87ms** | [config](./configs/yolov5s_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov5s_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov5s_quant.onnx) |
 | | | | | | | | | |
 | YOLOv6s | Base model | 640*640 | 42.4 | 65.9MB | 9.06ms | 2.90ms | - | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov6s.onnx) |
 | YOLOv6s | KL PTQ | 640*640 | 30.3 | 16.8MB | - | - | 1.83ms | [config](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/post_training_quantization/pytorch_yolo_series) | - |
 | YOLOv6s | QAT + distillation | 640*640 | **41.3** | 16.8MB | - | - | **1.83ms** | [config](./configs/yolov6s_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov6s_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov6s_quant.onnx) |
 | | | | | | | | | |
 | YOLOv7 | Base model | 640*640 | 51.1 | 141MB | 26.84ms | 7.44ms | - | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov7.onnx) |
-| YOLOv7 | PTQ | 640*640 | 50.2 | 36MB | - | - | 4.55ms | - | - |
+| YOLOv7 | PTQ | 640*640 | 50.2 | 36MB | - | - | 4.55ms | [config](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/post_training_quantization/pytorch_yolo_series) | - |
 | YOLOv7 | ACT QAT | 640*640 | **50.9** | 36MB | - | - | **4.55ms** | [config](./configs/yolov7_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov7_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov7_quant.onnx) |
 | | | | | | | | | |
 | YOLOv7-Tiny | Base model | 640*640 | 37.3 | 24MB | 5.06ms | 2.32ms | - | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov7-tiny.onnx) |
````
````diff
@@ -80,7 +88,18 @@ dataset/coco/
 (1) Prepare the ONNX model:
 
-The ONNX model can be prepared with the export script from [WongKinYiu/yolov7](https://github.com/WongKinYiu/yolov7); the steps are as follows:
+- YOLOv5: prepare the ONNX model via the official [export tutorial](https://github.com/ultralytics/yolov5/issues/251) of [ultralytics/yolov5](https://github.com/ultralytics/yolov5), or download the ready-made [yolov5s.onnx](https://paddle-slim-models.bj.bcebos.com/act/yolov5s.onnx).
+```shell
+python export.py --weights yolov5s.pt --include onnx
+```
+
+- YOLOv6: prepare the ONNX model via the official [export tutorial](https://github.com/meituan/YOLOv6/blob/main/deploy/ONNX/README.md) of [meituan/YOLOv6](https://github.com/meituan/YOLOv6), or download the ready-made [yolov6s.onnx](https://paddle-slim-models.bj.bcebos.com/act/yolov6s.onnx).
+
+- YOLOv7: prepare the ONNX model with the export script from [WongKinYiu/yolov7](https://github.com/WongKinYiu/yolov7); the steps are as follows:
 ```shell
 git clone https://github.com/WongKinYiu/yolov7.git
 python export.py --weights yolov7-tiny.pt --grid
 ```
````
````diff
@@ -90,7 +109,9 @@ python export.py --weights yolov7-tiny.pt --grid
 #### 3.4 Auto-compress and export the model
 
-The distillation + quantization auto-compression example is launched via the run.py script, which uses the ```paddleslim.auto_compression.AutoCompression``` interface to compress the model automatically. Set the model path, distillation, quantization and training parameters in the config file; once configured, the model can be quantized and distilled. The command is:
+The distillation + quantization auto-compression example is launched via the run.py script, which uses the ```paddleslim.auto_compression.AutoCompression``` interface to compress the model automatically. Set the model path, distillation, quantization and training parameters in the config file; once configured, the model can be quantized and distilled.
+
+This example launches auto compression with YOLOv7-Tiny; to switch to another model, just change the `--config_path`. The command is:
 
 - Single-GPU training:
 ```
````
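For orientation, here is a minimal sketch of what run.py does with this interface. It assumes the PaddleSlim 2.3-era ```AutoCompression``` argument style (model_dir/config/train_dataloader/eval_callback) and a placeholder config path, so treat it as illustrative and check run.py in this directory for the authoritative wiring.

```python
# Sketch of the AutoCompression flow driven by run.py (assumed PaddleSlim >= 2.3 API).
import paddle
from paddleslim.common import load_config
from paddleslim.auto_compression import AutoCompression

from dataset import COCOTrainDataset

paddle.enable_static()

all_config = load_config('./configs/yolov7_tiny_qat_dis.yaml')  # placeholder path
global_config = all_config["Global"]

dataset = COCOTrainDataset(
    dataset_dir=global_config['dataset_dir'],
    image_dir=global_config['train_image_dir'],
    anno_path=global_config['train_anno_path'])
train_loader = paddle.io.DataLoader(dataset, batch_size=1, shuffle=True)

ac = AutoCompression(
    model_dir=global_config["model_dir"],  # config keys mirror run.py/eval.py
    save_dir='./output/',
    config=all_config,
    train_dataloader=train_loader,
    eval_callback=None)  # pass an eval function to track mAP during training
ac.compress()
```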
````diff
@@ -160,7 +181,4 @@ python paddle_trt_infer.py --model_path=output --image_file=images/000000570688.
 ## 5. FAQ
 
-- To test the accuracy of the post-training-quantized model, run:
-```shell
-python post_quant.py --config_path=./configs/yolov7_qat_dis.yaml
-```
+- To run post-training quantization on the model, head to the [YOLO series PTQ example](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/post_training_quantization/pytorch_yolo_series) and experiment there.
````
example/auto_compression/pytorch_yolov5/configs/yolov5s_qat_dis.yaml → example/auto_compression/pytorch_yolo_series/configs/yolov5s_qat_dis.yaml — view file @ dfcc3c2b

````diff
@@ -6,6 +6,7 @@ Global:
   train_anno_path: annotations/instances_train2017.json
   val_anno_path: annotations/instances_val2017.json
   Evaluation: True
+  arch: YOLOv5
 
 Distillation:
   alpha: 1.0
````
example/auto_compression/pytorch_yolov6/configs/yolov6s_qat_dis.yaml → example/auto_compression/pytorch_yolo_series/configs/yolov6s_qat_dis.yaml — view file @ dfcc3c2b

````diff
@@ -6,6 +6,7 @@ Global:
   train_anno_path: annotations/instances_train2017.json
   val_anno_path: annotations/instances_val2017.json
   Evaluation: True
+  arch: YOLOv6
 
 Distillation:
   alpha: 1.0
````
example/auto_compression/pytorch_yolov7/configs/yolov7_qat_dis.yaml → example/auto_compression/pytorch_yolo_series/configs/yolov7_qat_dis.yaml — view file @ dfcc3c2b

````diff
@@ -6,6 +6,7 @@ Global:
   train_anno_path: annotations/instances_train2017.json
   val_anno_path: annotations/instances_val2017.json
   Evaluation: True
+  arch: YOLOv7
 
 Distillation:
   alpha: 1.0
````
example/auto_compression/pytorch_yolov7/configs/yolov7_tiny_qat_dis.yaml → example/auto_compression/pytorch_yolo_series/configs/yolov7_tiny_qat_dis.yaml — view file @ dfcc3c2b

````diff
@@ -6,6 +6,7 @@ Global:
   train_anno_path: annotations/instances_train2017.json
   val_anno_path: annotations/instances_val2017.json
   Evaluation: True
+  arch: YOLOv7
 
 Distillation:
   alpha: 1.0
````
example/auto_compression/pytorch_yolov5/cpp_infer/CMakeLists.txt → example/auto_compression/pytorch_yolo_series/cpp_infer/CMakeLists.txt — view file @ dfcc3c2b

File moved.
example/auto_compression/pytorch_yolov5/cpp_infer/README.md → example/auto_compression/pytorch_yolo_series/cpp_infer/README.md — view file @ dfcc3c2b

````diff
-# YOLOv5 TensorRT Benchmark test (Linux)
+# YOLOv7 TensorRT Benchmark test (Linux)
 
 ## Environment setup
````
````diff
@@ -22,21 +22,37 @@ CUDA_LIB=/usr/local/cuda/lib64
 TENSORRT_ROOT=/root/auto_compress/trt/trt8.4/
 ```
 
-## Paddle TensorRT test
+## Paddle tensorRT test
 
-- FP32
+- YOLOv5
 ```
+# FP32
 ./build/trt_run --model_file yolov5s_infer/model.pdmodel --params_file yolov5s_infer/model.pdiparams --run_mode=trt_fp32
+# FP16
+./build/trt_run --model_file yolov5s_infer/model.pdmodel --params_file yolov5s_infer/model.pdiparams --run_mode=trt_fp16
+# INT8
+./build/trt_run --model_file yolov5s_quant/model.pdmodel --params_file yolov5s_quant/model.pdiparams --run_mode=trt_int8
 ```
 
-- FP16
+- YOLOv6
 ```
-./build/trt_run --model_file yolov5s_infer/model.pdmodel --params_file yolov5s_infer/model.pdiparams --run_mode=trt_fp16
+# FP32
+./build/trt_run --arch=YOLOv6 --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_fp32
+# FP16
+./build/trt_run --arch=YOLOv6 --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_fp16
+# INT8
+./build/trt_run --arch=YOLOv6 --model_file yolov6s_quant/model.pdmodel --params_file yolov6s_quant/model.pdiparams --run_mode=trt_int8
 ```
 
-- INT8
+- YOLOv7
 ```
-./build/trt_run --model_file yolov5s_quant/model.pdmodel --params_file yolov5s_quant/model.pdiparams --run_mode=trt_int8
+# FP32
+./build/trt_run --model_file yolov7_infer/model.pdmodel --params_file yolov7_infer/model.pdiparams --run_mode=trt_fp32
+# FP16
+./build/trt_run --model_file yolov7_infer/model.pdmodel --params_file yolov7_infer/model.pdiparams --run_mode=trt_fp16
+# INT8
+./build/trt_run --model_file yolov7_quant/model.pdmodel --params_file yolov7_quant/model.pdiparams --run_mode=trt_int8
 ```
 
 ## Native TensorRT test
````
````diff
@@ -49,6 +65,7 @@ trtexec --onnx=yolov5s.onnx --workspace=1024 --avgRuns=1000 --inputIOFormats=fp1
 # INT8
 trtexec --onnx=yolov5s.onnx --workspace=1024 --avgRuns=1000 --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --int8
 ```
+- Note: --onnx=yolov5s.onnx can be replaced with the yolov6s.onnx or yolov7.onnx model.
 
 ## Performance comparison
@@ -56,6 +73,12 @@ trtexec --onnx=yolov5s.onnx --workspace=1024 --avgRuns=1000 --inputIOFormats=fp1
 | :--------: | :--------: |:-------- |:--------: | :---------------------: |
 | Paddle TensorRT | yolov5s | 5.95ms | 2.44ms | 1.87ms |
 | TensorRT | yolov5s | 6.16ms | 2.58ms | 2.07ms |
+| | | | | |
+| Paddle TensorRT | YOLOv6s | 9.06ms | 2.90ms | 1.83ms |
+| TensorRT | YOLOv6s | 8.59ms | 2.83ms | 1.87ms |
+| | | | | |
+| Paddle TensorRT | YOLOv7 | 26.84ms | 7.44ms | 4.55ms |
+| TensorRT | YOLOv7 | 28.25ms | 7.23ms | 4.67ms |
 
 Environment:
 - Tesla T4, TensorRT 8.4.1, CUDA 11.2
````
example/auto_compression/pytorch_yolov5/cpp_infer/compile.sh → example/auto_compression/pytorch_yolo_series/cpp_infer/compile.sh — view file @ dfcc3c2b

File moved.
example/auto_compression/pytorch_yolov6/cpp_infer/trt_run.cc → example/auto_compression/pytorch_yolo_series/cpp_infer/trt_run.cc — view file @ dfcc3c2b

````diff
@@ -19,6 +19,7 @@ using phi::dtype::float16;
 DEFINE_string(model_dir, "", "Directory of the inference model.");
 DEFINE_string(model_file, "", "Path of the inference model file.");
 DEFINE_string(params_file, "", "Path of the inference params file.");
+DEFINE_string(arch, "YOLOv5", "Architectures name, can be: YOLOv5, YOLOv6, YOLOv7.");
 DEFINE_string(run_mode, "trt_fp32", "run_mode which can be: trt_fp32, trt_fp16 and trt_int8");
 DEFINE_int32(batch_size, 1, "Batch size.");
 DEFINE_int32(gpu_id, 0, "GPU card ID num.");
@@ -106,11 +107,15 @@ int main(int argc, char *argv[]) {
   using dtype = float16;
   std::vector<dtype> input_data(FLAGS_batch_size * 3 * 640 * 640, dtype(1.0));
 
+  int out_box_shape = 25200;
+  if (FLAGS_arch == "YOLOv6"){
+    out_box_shape = 8400;
+  }
   dtype *out_data;
-  int out_data_size = FLAGS_batch_size * 8400 * 85;
+  int out_data_size = FLAGS_batch_size * out_box_shape * 85;
   cudaHostAlloc((void**)&out_data, sizeof(float) * out_data_size, cudaHostAllocMapped);
 
-  std::vector<int> out_shape{FLAGS_batch_size, 1, 8400, 85};
+  std::vector<int> out_shape{FLAGS_batch_size, 1, out_box_shape, 85};
   run<dtype>(predictor.get(), input_data, input_shape, out_data, out_shape);
   return 0;
 }
````
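One detail worth spelling out in this hunk is where 25200 and 8400 come from. As a side note (our arithmetic, not part of the commit): at a 640×640 input the three detection strides 8/16/32 give 80² + 40² + 20² = 8400 grid cells; the anchor-free YOLOv6 head emits one box per cell (8400 rows), while YOLOv5/YOLOv7 emit three anchor boxes per cell (3 × 8400 = 25200 rows), each row carrying 85 values (4 box coordinates, 1 objectness score, 80 COCO class scores).

```python
# Sanity check of the output box counts at a 640x640 input (illustration only).
strides = [8, 16, 32]
cells = sum((640 // s) ** 2 for s in strides)  # 6400 + 1600 + 400 = 8400
print(cells)      # 8400  -> anchor-free YOLOv6 head
print(cells * 3)  # 25200 -> YOLOv5/YOLOv7 heads with 3 anchors per cell
```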
example/auto_compression/pytorch_yolov7/dataset.py → example/auto_compression/pytorch_yolo_series/dataset.py — view file @ dfcc3c2b

````diff
@@ -10,10 +10,12 @@ class COCOValDataset(paddle.io.Dataset):
                  dataset_dir=None,
                  image_dir=None,
                  anno_path=None,
-                 img_size=[640, 640]):
+                 img_size=[640, 640],
+                 input_name='x2paddle_images'):
         self.dataset_dir = dataset_dir
         self.image_dir = image_dir
         self.img_size = img_size
+        self.input_name = input_name
         self.ann_file = os.path.join(dataset_dir, anno_path)
         self.coco = COCO(self.ann_file)
         ori_ids = list(sorted(self.coco.imgs.keys()))
@@ -110,4 +112,4 @@ class COCOTrainDataset(COCOValDataset):
         img_id = self.ids[idx]
         img = self._get_img_data_from_img_id(img_id)
         img, scale_factor = self.image_preprocess(img, self.img_size)
-        return {'x2paddle_images': img}
+        return {self.input_name: img}
````
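To illustrate the new `input_name` hook, here is a short sketch of how the reworked training dataset might be built for each architecture. The COCO paths are placeholders for a local dataset layout; only the `input_name` values come from the diffs in this commit.

```python
# Sketch: building the train dataset with the new input_name argument.
# The dataset paths below are placeholders.
import paddle
from dataset import COCOTrainDataset

arch = 'YOLOv6'  # or 'YOLOv5' / 'YOLOv7'
input_name = 'x2paddle_image_arrays' if arch == 'YOLOv6' else 'x2paddle_images'

train_dataset = COCOTrainDataset(
    dataset_dir='dataset/coco',
    image_dir='train2017',
    anno_path='annotations/instances_train2017.json',
    input_name=input_name)
train_loader = paddle.io.DataLoader(train_dataset, batch_size=1, shuffle=True)
```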
example/auto_compression/pytorch_yolov7/eval.py → example/auto_compression/pytorch_yolo_series/eval.py — view file @ dfcc3c2b

````diff
@@ -18,9 +18,9 @@ import numpy as np
 import argparse
 from tqdm import tqdm
 import paddle
-from paddleslim.common import load_config as load_slim_config
+from paddleslim.common import load_config
 from paddleslim.common import load_inference_model
-from post_process import YOLOv7PostProcess, coco_metric
+from post_process import YOLOPostProcess, coco_metric
 from dataset import COCOValDataset
@@ -60,8 +60,7 @@ def eval():
             feed={feed_target_names[0]: data_all['image']},
             fetch_list=fetch_targets,
             return_numpy=False)
-        res = {}
-        postprocess = YOLOv7PostProcess(
+        postprocess = YOLOPostProcess(
             score_threshold=0.001, nms_threshold=0.65, multi_label=True)
         res = postprocess(np.array(outs[0]), data_all['scale_factor'])
         bboxes_list.append(res['bbox'])
@@ -74,7 +73,7 @@ def eval():
 def main():
     global global_config
-    all_config = load_slim_config(FLAGS.config_path)
+    all_config = load_config(FLAGS.config_path)
     global_config = all_config["Global"]
 
     global val_loader
````
example/auto_compression/pytorch_yolov5/images/000000570688.jpg → example/auto_compression/pytorch_yolo_series/images/000000570688.jpg — view file @ dfcc3c2b

File moved.
example/auto_compression/pytorch_yolov7/onnx_trt_infer.py → example/auto_compression/pytorch_yolo_series/onnx_trt_infer.py — view file @ dfcc3c2b

File moved.
example/auto_compression/pytorch_yolov7/paddle_trt_infer.py → example/auto_compression/pytorch_yolo_series/paddle_trt_infer.py — view file @ dfcc3c2b

````diff
@@ -241,9 +241,13 @@ def predict_image(predictor,
                   image_shape=[640, 640],
                   warmup=1,
                   repeats=1,
-                  threshold=0.5):
+                  threshold=0.5,
+                  arch='YOLOv5'):
     img, scale_factor = image_preprocess(image_file, image_shape)
-    inputs['x2paddle_images'] = img
+    if arch == 'YOLOv6':
+        inputs['x2paddle_image_arrays'] = img
+    else:
+        inputs['x2paddle_images'] = img
     input_names = predictor.get_input_names()
     for i in range(len(input_names)):
         input_tensor = predictor.get_input_handle(input_names[i])
@@ -303,6 +307,8 @@ if __name__ == '__main__':
         default='GPU',
         help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU")
+    parser.add_argument(
+        '--arch', type=str, default='YOLOv5', help="architectures name.")
     parser.add_argument(
         '--img_shape', type=int, default=640, help="input_size")
     args = parser.parse_args()
@@ -316,4 +322,5 @@ if __name__ == '__main__':
         args.image_file,
         image_shape=[args.img_shape, args.img_shape],
         warmup=warmup,
-        repeats=repeats)
+        repeats=repeats,
+        arch=args.arch)
````
example/auto_compression/pytorch_yolov7/post_process.py → example/auto_compression/pytorch_yolo_series/post_process.py — view file @ dfcc3c2b

````diff
@@ -70,9 +70,9 @@ def nms(boxes, scores, iou_threshold):
     return keep
 
 
-class YOLOv7PostProcess(object):
+class YOLOPostProcess(object):
     """
-    Post process of YOLOv6 network.
+    Post process of YOLO-series network.
     args:
         score_threshold(float): Threshold to filter out bounding boxes with low
             confidence score. If not provided, consider all boxes.
````
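A small usage sketch of the renamed class on a dummy prediction tensor. A 100-box dummy is used instead of the full 25200-row output so the pure-numpy NMS runs quickly; the random values merely stand in for a real model output.

```python
# Sketch: decoding a raw YOLO head output with the renamed YOLOPostProcess.
# Shapes follow the [batch, num_boxes, 85] layout; values here are random.
import numpy as np
from post_process import YOLOPostProcess

outs = np.random.rand(1, 100, 85).astype(np.float32)
scale_factor = np.array([[1.0, 1.0]])  # [im_scale_y, im_scale_x] per image

postprocess = YOLOPostProcess(
    score_threshold=0.001, nms_threshold=0.65, multi_label=True)
res = postprocess(outs, scale_factor)
print(res['bbox'].shape, res['bbox_num'])  # [N, 6] boxes and per-image counts
```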
example/auto_compression/pytorch_yolov7/run.py → example/auto_compression/pytorch_yolo_series/run.py — view file @ dfcc3c2b

````diff
@@ -18,10 +18,10 @@ import numpy as np
 import argparse
 from tqdm import tqdm
 import paddle
-from paddleslim.common import load_config as load_slim_config
+from paddleslim.common import load_config
 from paddleslim.auto_compression import AutoCompression
 from dataset import COCOValDataset, COCOTrainDataset
-from post_process import YOLOv7PostProcess, coco_metric
+from post_process import YOLOPostProcess, coco_metric
 
 
 def argsparser():
@@ -61,7 +61,7 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
             fetch_list=test_fetch_list,
             return_numpy=False)
         res = {}
-        postprocess = YOLOv7PostProcess(
+        postprocess = YOLOPostProcess(
             score_threshold=0.001, nms_threshold=0.65, multi_label=True)
         res = postprocess(np.array(outs[0]), data_all['scale_factor'])
         bboxes_list.append(res['bbox'])
@@ -74,15 +74,16 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
 def main():
     global global_config
-    all_config = load_slim_config(FLAGS.config_path)
-    assert "Global" in all_config, "Key 'Global' not found in config file. \n{}".format(all_config)
+    all_config = load_config(FLAGS.config_path)
+    assert "Global" in all_config, f"Key 'Global' not found in config file. \n{all_config}"
     global_config = all_config["Global"]
 
+    input_name = 'x2paddle_image_arrays' if global_config['arch'] == 'YOLOv6' else 'x2paddle_images'
     dataset = COCOTrainDataset(
         dataset_dir=global_config['dataset_dir'],
         image_dir=global_config['train_image_dir'],
-        anno_path=global_config['train_anno_path'])
+        anno_path=global_config['train_anno_path'],
+        input_name=input_name)
     train_loader = paddle.io.DataLoader(
         dataset, batch_size=1, shuffle=True, drop_last=True, num_workers=0)
````
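The same arch-dependent feed name recurs at deployment time, which is why both run.py above and paddle_trt_infer.py branch on `arch`. A compact summary of the mapping, with names taken straight from the diffs:

```python
# Sketch: X2Paddle-exported YOLO models expose different graph input names per arch.
def feed_name(arch: str) -> str:
    # YOLOv6's converted graph names its input 'x2paddle_image_arrays';
    # YOLOv5 and YOLOv7 use 'x2paddle_images'.
    return 'x2paddle_image_arrays' if arch == 'YOLOv6' else 'x2paddle_images'

assert feed_name('YOLOv6') == 'x2paddle_image_arrays'
assert feed_name('YOLOv5') == feed_name('YOLOv7') == 'x2paddle_images'
```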
example/auto_compression/pytorch_yolov5/README.md — deleted (100644 → 0), view file @ 407d47b3

````md
# YOLOv5 Object Detection Auto Compression Example

Contents:

- [1. Introduction](#1简介)
- [2. Benchmark](#2Benchmark)
- [3. Start auto compression](#自动压缩流程)
  - [3.1 Environment setup](#31-准备环境)
  - [3.2 Prepare dataset](#32-准备数据集)
  - [3.3 Prepare inference model](#33-准备预测模型)
  - [3.4 Test model accuracy](#34-测试模型精度)
  - [3.5 Auto-compress and export the model](#35-自动压缩并产出模型)
- [4. Deployment](#4预测部署)
- [5. FAQ](5FAQ)

## 1. Introduction

The Paddle model conversion tool [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) converts ```Caffe/TensorFlow/ONNX/PyTorch``` models into PaddlePaddle inference models in one step. With X2Paddle, inference models from these frameworks can easily use PaddleSlim's auto-compression feature.

This example takes the [ultralytics/yolov5](https://github.com/ultralytics/yolov5) object detection model, converts it from a PyTorch model into a Paddle model, and then compresses it with ACT auto compression. The compression strategy used in this example is quantization-aware training.

## 2. Benchmark

| Model | Strategy | Input size | mAP<sup>val<br>0.5:0.95 | Latency<sup><small>FP32</small><sup><br><sup>(ms) | Latency<sup><small>FP16</small><sup><br><sup>(ms) | Latency<sup><small>INT8</small><sup><br><sup>(ms) | Config | Inference model |
| :-------- |:-------- |:--------: | :---------------------: | :----------------: | :----------------: | :---------------: | :-----------------------------: | :-----------------------------: |
| YOLOv5s | Base model | 640*640 | 37.4 | 5.95ms | 2.44ms | - | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov5s.onnx) |
| YOLOv5s | KL PTQ | 640*640 | 36.0 | - | - | 1.87ms | - | - |
| YOLOv5s | QAT + distillation | 640*640 | **36.9** | - | - | **1.87ms** | [config](./configs/yolov5s_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov5s_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov5s_quant.onnx) |

Notes:
- All mAP figures are evaluated on the COCO val2017 dataset.
- YOLOv5s is tested on a Tesla T4 GPU with TensorRT 8.4.1 enabled, batch_size=1; the test script is [cpp_infer](./cpp_infer).

## 3. Auto compression pipeline

#### 3.1 Environment setup

- PaddlePaddle >= 2.3 (download and install from the [Paddle website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html))
- PaddleSlim >= 2.3
- PaddleDet >= 2.4
- [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) >= 1.3.6
- opencv-python

(1) Install paddlepaddle:
```shell
# CPU
pip install paddlepaddle
# GPU
pip install paddlepaddle-gpu
```

(2) Install paddleslim:
```shell
pip install paddleslim
```

(3) Install paddledet:
```shell
pip install paddledet
```

Note: PaddleDet is installed so that PaddleDetection's Dataloader components can be used directly.

#### 3.2 Prepare dataset

By default this demo runs the auto-compression experiment on COCO data and relies on PaddleDetection's data-reading module. For custom COCO-format or other-format data, see the [PaddleDetection data preparation docs](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/docs/tutorials/PrepareDataSet.md).

If the dataset is already prepared, just set the `dataset_dir` field under `EvalDataset` in [./configs/yolov6_reader.yml] to your dataset path.

#### 3.3 Prepare the ONNX inference model

Prepare the ONNX model via the official [export tutorial](https://github.com/ultralytics/yolov5/issues/251) of [ultralytics/yolov5](https://github.com/ultralytics/yolov5), or download the ready-made [yolov5s.onnx](https://paddle-slim-models.bj.bcebos.com/act/yolov5s.onnx).

```shell
python export.py --weights yolov5s.pt --include onnx
```

#### 3.4 Auto-compress and export the model

The distillation + quantization auto-compression example is launched via the run.py script, which uses the ```paddleslim.auto_compression.AutoCompression``` interface to compress the model automatically. Set the model path, distillation, quantization and training parameters in the config file; once configured, the model can be quantized and distilled. The commands are:

- Single-GPU training:
```
export CUDA_VISIBLE_DEVICES=0
python run.py --config_path=./configs/yolov5s_qat_dis.yaml --save_dir='./output/'
```

- Multi-GPU training:
```
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 run.py \
          --config_path=./configs/yolov5s_qat_dis.yaml --save_dir='./output/'
```

#### 3.5 Test model accuracy

Use the eval.py script to get the model's mAP:
```
export CUDA_VISIBLE_DEVICES=0
python eval.py --config_path=./configs/yolov5s_qat_dis.yaml
```

**Note**: to test a quantized model, change the model path in the `model_dir` field of the config file.

## 4. Deployment

#### Export to ONNX and deploy with TensorRT

- First install Paddle2onnx:
```shell
pip install paddle2onnx==1.0.0rc3
```

- Then export the quantized model to ONNX:
```shell
paddle2onnx --model_dir output/ \
            --model_filename model.pdmodel \
            --params_filename model.pdiparams \
            --opset_version 13 \
            --enable_onnx_checker True \
            --save_file yolov5s_quant.onnx \
            --deploy_backend tensorrt
```

- Run the test:
```shell
python yolov5_onnx_trt.py --model_path=yolov5s_quant.onnx --image_file=images/000000570688.jpg --precision=int8
```

#### Paddle-TensorRT deployment

- C++ deployment:

Go into the [cpp_infer](./cpp_infer) folder, prepare the environment and compile following the [C++ TensorRT Benchmark test tutorial](./cpp_infer/README.md), then run the test:
```shell
# compile
bash compile.sh
# run
./build/trt_run --model_file yolov5s_quant/model.pdmodel --params_file yolov5s_quant/model.pdiparams --run_mode=trt_int8
```

- Python deployment:

First install a [Paddle package](https://www.paddlepaddle.org.cn/inference/v2.3/user_guides/download_lib.html#python) built with TensorRT, then deploy with [paddle_trt_infer.py](./paddle_trt_infer.py):
```shell
python paddle_trt_infer.py --model_path=output --image_file=images/000000570688.jpg --benchmark=True --run_mode=trt_int8
```

## 5. FAQ

- To test the accuracy of the post-training-quantized model, run:
```shell
python post_quant.py --config_path=./configs/yolov5s_qat_dis.yaml
```
````
example/auto_compression/pytorch_yolov5/cpp_infer/trt_run.cc — deleted (100644 → 0), view file @ 407d47b3

```cpp
#include <chrono>
#include <iostream>
#include <memory>
#include <numeric>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <cuda_runtime.h>

#include "paddle/include/paddle_inference_api.h"
#include "paddle/include/experimental/phi/common/float16.h"

using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::CreatePredictor;
using paddle_infer::PrecisionType;
using phi::dtype::float16;

DEFINE_string(model_dir, "", "Directory of the inference model.");
DEFINE_string(model_file, "", "Path of the inference model file.");
DEFINE_string(params_file, "", "Path of the inference params file.");
DEFINE_string(run_mode, "trt_fp32", "run_mode which can be: trt_fp32, trt_fp16 and trt_int8");
DEFINE_int32(batch_size, 1, "Batch size.");
DEFINE_int32(gpu_id, 0, "GPU card ID num.");
DEFINE_int32(trt_min_subgraph_size, 3, "tensorrt min_subgraph_size");
DEFINE_int32(warmup, 50, "warmup");
DEFINE_int32(repeats, 1000, "repeats");

using Time = decltype(std::chrono::high_resolution_clock::now());
Time time() { return std::chrono::high_resolution_clock::now(); };
double time_diff(Time t1, Time t2) {
  typedef std::chrono::microseconds ms;
  auto diff = t2 - t1;
  ms counter = std::chrono::duration_cast<ms>(diff);
  return counter.count() / 1000.0;
}

std::shared_ptr<Predictor> InitPredictor() {
  Config config;
  std::string model_path;
  if (FLAGS_model_dir != "") {
    config.SetModel(FLAGS_model_dir);
    model_path = FLAGS_model_dir.substr(0, FLAGS_model_dir.find_last_of("/"));
  } else {
    config.SetModel(FLAGS_model_file, FLAGS_params_file);
    model_path = FLAGS_model_file.substr(0, FLAGS_model_file.find_last_of("/"));
  }
  // enable tune
  std::cout << "model_path: " << model_path << std::endl;
  config.EnableUseGpu(256, FLAGS_gpu_id);
  if (FLAGS_run_mode == "trt_fp32") {
    config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size,
                                FLAGS_trt_min_subgraph_size,
                                PrecisionType::kFloat32, false, false);
  } else if (FLAGS_run_mode == "trt_fp16") {
    config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size,
                                FLAGS_trt_min_subgraph_size,
                                PrecisionType::kHalf, false, false);
  } else if (FLAGS_run_mode == "trt_int8") {
    config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size,
                                FLAGS_trt_min_subgraph_size,
                                PrecisionType::kInt8, false, false);
  }
  config.EnableMemoryOptim();
  config.SwitchIrOptim(true);
  return CreatePredictor(config);
}

template <typename type>
void run(Predictor *predictor, const std::vector<type> &input,
         const std::vector<int> &input_shape, type *out_data,
         std::vector<int> out_shape) {
  // prepare input
  int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1,
                                  std::multiplies<int>());

  auto input_names = predictor->GetInputNames();
  auto input_t = predictor->GetInputHandle(input_names[0]);
  input_t->Reshape(input_shape);
  input_t->CopyFromCpu(input.data());

  for (int i = 0; i < FLAGS_warmup; ++i) CHECK(predictor->Run());

  auto st = time();
  for (int i = 0; i < FLAGS_repeats; ++i) {
    auto input_names = predictor->GetInputNames();
    auto input_t = predictor->GetInputHandle(input_names[0]);
    input_t->Reshape(input_shape);
    input_t->CopyFromCpu(input.data());
    CHECK(predictor->Run());

    auto output_names = predictor->GetOutputNames();
    auto output_t = predictor->GetOutputHandle(output_names[0]);
    std::vector<int> output_shape = output_t->shape();
    output_t->ShareExternalData<type>(out_data, out_shape,
                                      paddle_infer::PlaceType::kGPU);
  }
  LOG(INFO) << "[" << FLAGS_run_mode << " bs-" << FLAGS_batch_size
            << " ] run avg time is " << time_diff(st, time()) / FLAGS_repeats
            << " ms";
}

int main(int argc, char *argv[]) {
  google::ParseCommandLineFlags(&argc, &argv, true);
  auto predictor = InitPredictor();
  std::vector<int> input_shape = {FLAGS_batch_size, 3, 640, 640};
  // float16
  using dtype = float16;
  std::vector<dtype> input_data(FLAGS_batch_size * 3 * 640 * 640, dtype(1.0));

  dtype *out_data;
  int out_data_size = FLAGS_batch_size * 25200 * 85;
  cudaHostAlloc((void **)&out_data, sizeof(float) * out_data_size,
                cudaHostAllocMapped);

  std::vector<int> out_shape{FLAGS_batch_size, 1, 25200, 85};
  run<dtype>(predictor.get(), input_data, input_shape, out_data, out_shape);
  return 0;
}
```
example/auto_compression/pytorch_yolov5/dataset.py — deleted (100644 → 0), view file @ 407d47b3

```python
from pycocotools.coco import COCO
import cv2
import os
import numpy as np
import paddle


class COCOValDataset(paddle.io.Dataset):
    def __init__(self,
                 dataset_dir=None,
                 image_dir=None,
                 anno_path=None,
                 img_size=[640, 640]):
        self.dataset_dir = dataset_dir
        self.image_dir = image_dir
        self.img_size = img_size
        self.ann_file = os.path.join(dataset_dir, anno_path)
        self.coco = COCO(self.ann_file)
        ori_ids = list(sorted(self.coco.imgs.keys()))
        # check gt bbox
        clean_ids = []
        for idx in ori_ids:
            ins_anno_ids = self.coco.getAnnIds(imgIds=[idx], iscrowd=False)
            instances = self.coco.loadAnns(ins_anno_ids)
            num_bbox = 0
            for inst in instances:
                if inst.get('ignore', False):
                    continue
                if 'bbox' not in inst.keys():
                    continue
                elif not any(np.array(inst['bbox'])):
                    continue
                else:
                    num_bbox += 1
            if num_bbox > 0:
                clean_ids.append(idx)
        self.ids = clean_ids

    def __getitem__(self, idx):
        img_id = self.ids[idx]
        img = self._get_img_data_from_img_id(img_id)
        img, scale_factor = self.image_preprocess(img, self.img_size)
        return {
            'image': img,
            'im_id': np.array([img_id]),
            'scale_factor': scale_factor
        }

    def __len__(self):
        return len(self.ids)

    def _get_img_data_from_img_id(self, img_id):
        img_info = self.coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.dataset_dir, self.image_dir,
                                img_info['file_name'])
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return img

    def _generate_scale(self, im, target_shape, keep_ratio=True):
        """
        Args:
            im (np.ndarray): image (np.ndarray)
        Returns:
            im_scale_x: the resize ratio of X
            im_scale_y: the resize ratio of Y
        """
        origin_shape = im.shape[:2]
        if keep_ratio:
            im_size_min = np.min(origin_shape)
            im_size_max = np.max(origin_shape)
            target_size_min = np.min(target_shape)
            target_size_max = np.max(target_shape)
            im_scale = float(target_size_min) / float(im_size_min)
            if np.round(im_scale * im_size_max) > target_size_max:
                im_scale = float(target_size_max) / float(im_size_max)
            im_scale_x = im_scale
            im_scale_y = im_scale
        else:
            resize_h, resize_w = target_shape
            im_scale_y = resize_h / float(origin_shape[0])
            im_scale_x = resize_w / float(origin_shape[1])
        return im_scale_y, im_scale_x

    def image_preprocess(self, img, target_shape):
        # Resize image
        im_scale_y, im_scale_x = self._generate_scale(img, target_shape)
        img = cv2.resize(
            img,
            None,
            None,
            fx=im_scale_x,
            fy=im_scale_y,
            interpolation=cv2.INTER_LINEAR)
        # Pad
        im_h, im_w = img.shape[:2]
        h, w = target_shape[:]
        if h != im_h or w != im_w:
            canvas = np.ones((h, w, 3), dtype=np.float32)
            canvas *= np.array([114.0, 114.0, 114.0], dtype=np.float32)
            canvas[0:im_h, 0:im_w, :] = img.astype(np.float32)
            img = canvas
        img = np.transpose(img / 255, [2, 0, 1])
        scale_factor = np.array([im_scale_y, im_scale_x])
        return img.astype(np.float32), scale_factor


class COCOTrainDataset(COCOValDataset):
    def __getitem__(self, idx):
        img_id = self.ids[idx]
        img = self._get_img_data_from_img_id(img_id)
        img, scale_factor = self.image_preprocess(img, self.img_size)
        return {'x2paddle_images': img}
```
example/auto_compression/pytorch_yolov5/eval.py — deleted (100644 → 0), view file @ 407d47b3

```python
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import numpy as np
import argparse
from tqdm import tqdm

import paddle
from paddleslim.common import load_config as load_slim_config
from paddleslim.common import load_onnx_model
from post_process import YOLOv5PostProcess, coco_metric
from dataset import COCOValDataset


def argsparser():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--config_path',
        type=str,
        default=None,
        help="path of compression strategy config.",
        required=True)
    parser.add_argument(
        '--devices',
        type=str,
        default='gpu',
        help="which device used to compress.")
    return parser


def eval():
    place = paddle.CUDAPlace(0) if FLAGS.devices == 'gpu' else paddle.CPUPlace()
    exe = paddle.static.Executor(place)

    val_program, feed_target_names, fetch_targets = load_onnx_model(
        global_config["model_dir"])

    bboxes_list, bbox_nums_list, image_id_list = [], [], []
    with tqdm(
            total=len(val_loader),
            bar_format='Evaluation stage, Run batch:|{bar}| {n_fmt}/{total_fmt}',
            ncols=80) as t:
        for data in val_loader:
            data_all = {k: np.array(v) for k, v in data.items()}
            outs = exe.run(val_program,
                           feed={feed_target_names[0]: data_all['image']},
                           fetch_list=fetch_targets,
                           return_numpy=False)
            res = {}
            postprocess = YOLOv5PostProcess(
                score_threshold=0.001, nms_threshold=0.65, multi_label=True)
            res = postprocess(np.array(outs[0]), data_all['scale_factor'])
            bboxes_list.append(res['bbox'])
            bbox_nums_list.append(res['bbox_num'])
            image_id_list.append(np.array(data_all['im_id']))
            t.update()
    coco_metric(anno_file, bboxes_list, bbox_nums_list, image_id_list)


def main():
    global global_config
    all_config = load_slim_config(FLAGS.config_path)
    global_config = all_config["Global"]

    global val_loader
    dataset = COCOValDataset(
        dataset_dir=global_config['dataset_dir'],
        image_dir=global_config['val_image_dir'],
        anno_path=global_config['val_anno_path'])
    global anno_file
    anno_file = dataset.ann_file
    val_loader = paddle.io.DataLoader(dataset, batch_size=1)

    eval()


if __name__ == '__main__':
    paddle.enable_static()
    parser = argsparser()
    FLAGS = parser.parse_args()

    assert FLAGS.devices in ['cpu', 'gpu', 'xpu', 'npu']
    paddle.set_device(FLAGS.devices)

    main()
```
example/auto_compression/pytorch_yolov5/paddle_trt_infer.py — deleted (100644 → 0), view file @ 407d47b3

```python
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import cv2
import numpy as np
import argparse
import time

from paddle.inference import Config
from paddle.inference import create_predictor

from post_process import YOLOv5PostProcess

CLASS_LABEL = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush'
]


def generate_scale(im, target_shape, keep_ratio=True):
    """
    Args:
        im (np.ndarray): image (np.ndarray)
    Returns:
        im_scale_x: the resize ratio of X
        im_scale_y: the resize ratio of Y
    """
    origin_shape = im.shape[:2]
    if keep_ratio:
        im_size_min = np.min(origin_shape)
        im_size_max = np.max(origin_shape)
        target_size_min = np.min(target_shape)
        target_size_max = np.max(target_shape)
        im_scale = float(target_size_min) / float(im_size_min)
        if np.round(im_scale * im_size_max) > target_size_max:
            im_scale = float(target_size_max) / float(im_size_max)
        im_scale_x = im_scale
        im_scale_y = im_scale
    else:
        resize_h, resize_w = target_shape
        im_scale_y = resize_h / float(origin_shape[0])
        im_scale_x = resize_w / float(origin_shape[1])
    return im_scale_y, im_scale_x


def image_preprocess(img_path, target_shape):
    img = cv2.imread(img_path)
    # Resize
    im_scale_y, im_scale_x = generate_scale(img, target_shape)
    img = cv2.resize(
        img,
        None,
        None,
        fx=im_scale_x,
        fy=im_scale_y,
        interpolation=cv2.INTER_LINEAR)
    # Pad
    im_h, im_w = img.shape[:2]
    h, w = target_shape[:]
    if h != im_h or w != im_w:
        canvas = np.ones((h, w, 3), dtype=np.float32)
        canvas *= np.array([114.0, 114.0, 114.0], dtype=np.float32)
        canvas[0:im_h, 0:im_w, :] = img.astype(np.float32)
        img = canvas
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.transpose(img, [2, 0, 1]) / 255
    img = np.expand_dims(img, 0)
    scale_factor = np.array([[im_scale_y, im_scale_x]])
    return img.astype(np.float32), scale_factor


def get_color_map_list(num_classes):
    color_map = num_classes * [0, 0, 0]
    for i in range(0, num_classes):
        j = 0
        lab = i
        while lab:
            color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
            color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
            color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
            j += 1
            lab >>= 3
    color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)]
    return color_map


def draw_box(image_file, results, class_label, threshold=0.5):
    srcimg = cv2.imread(image_file, 1)
    for i in range(len(results)):
        color_list = get_color_map_list(len(class_label))
        clsid2color = {}
        classid, conf = int(results[i, 0]), results[i, 1]
        if conf < threshold:
            continue
        xmin, ymin, xmax, ymax = int(results[i, 2]), int(results[i, 3]), int(
            results[i, 4]), int(results[i, 5])
        if classid not in clsid2color:
            clsid2color[classid] = color_list[classid]
        color = tuple(clsid2color[classid])
        cv2.rectangle(srcimg, (xmin, ymin), (xmax, ymax), color, thickness=2)
        print(class_label[classid] + ': ' + str(round(conf, 3)))
        cv2.putText(
            srcimg,
            class_label[classid] + ':' + str(round(conf, 3)), (xmin, ymin - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.8, (0, 255, 0),
            thickness=2)
    return srcimg


def load_predictor(model_dir,
                   run_mode='paddle',
                   batch_size=1,
                   device='CPU',
                   min_subgraph_size=3,
                   use_dynamic_shape=False,
                   trt_min_shape=1,
                   trt_max_shape=1280,
                   trt_opt_shape=640,
                   trt_calib_mode=False,
                   cpu_threads=1,
                   enable_mkldnn=False,
                   enable_mkldnn_bfloat16=False,
                   delete_shuffle_pass=False):
    """set AnalysisConfig, generate AnalysisPredictor
    Args:
        model_dir (str): root path of __model__ and __params__
        device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
        run_mode (str): mode of running(paddle/trt_fp32/trt_fp16/trt_int8)
        use_dynamic_shape (bool): use dynamic shape or not
        trt_min_shape (int): min shape for dynamic shape in trt
        trt_max_shape (int): max shape for dynamic shape in trt
        trt_opt_shape (int): opt shape for dynamic shape in trt
        trt_calib_mode (bool): If the model is produced by TRT offline quantitative
            calibration, trt_calib_mode need to set True
        delete_shuffle_pass (bool): whether to remove shuffle_channel_detect_pass in TensorRT.
            Used by action model.
    Returns:
        predictor (PaddlePredictor): AnalysisPredictor
    Raises:
        ValueError: predict by TensorRT need device == 'GPU'.
    """
    if device != 'GPU' and run_mode != 'paddle':
        raise ValueError(
            "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}"
            .format(run_mode, device))
    config = Config(
        os.path.join(model_dir, 'model.pdmodel'),
        os.path.join(model_dir, 'model.pdiparams'))
    if device == 'GPU':
        # initial GPU memory(M), device ID
        config.enable_use_gpu(200, 0)
        # optimize graph and fuse op
        config.switch_ir_optim(True)
    elif device == 'XPU':
        config.enable_lite_engine()
        config.enable_xpu(10 * 1024 * 1024)
    else:
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(cpu_threads)
        if enable_mkldnn:
            try:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
                if enable_mkldnn_bfloat16:
                    config.enable_mkldnn_bfloat16()
            except Exception as e:
                print(
                    "The current environment does not support `mkldnn`, so disable mkldnn."
                )
                pass

    precision_map = {
        'trt_int8': Config.Precision.Int8,
        'trt_fp32': Config.Precision.Float32,
        'trt_fp16': Config.Precision.Half
    }
    if run_mode in precision_map.keys():
        config.enable_tensorrt_engine(
            workspace_size=(1 << 25) * batch_size,
            max_batch_size=batch_size,
            min_subgraph_size=min_subgraph_size,
            precision_mode=precision_map[run_mode],
            use_static=False,
            use_calib_mode=trt_calib_mode)

        if use_dynamic_shape:
            min_input_shape = {
                'image': [batch_size, 3, trt_min_shape, trt_min_shape]
            }
            max_input_shape = {
                'image': [batch_size, 3, trt_max_shape, trt_max_shape]
            }
            opt_input_shape = {
                'image': [batch_size, 3, trt_opt_shape, trt_opt_shape]
            }
            config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
                                              opt_input_shape)
            print('trt set dynamic shape done!')

    # disable print log when predict
    config.disable_glog_info()
    # enable shared memory
    config.enable_memory_optim()
    # disable feed, fetch OP, needed by zero_copy_run
    config.switch_use_feed_fetch_ops(False)
    if delete_shuffle_pass:
        config.delete_pass("shuffle_channel_detect_pass")
    predictor = create_predictor(config)
    return predictor


def predict_image(predictor,
                  image_file,
                  image_shape=[640, 640],
                  warmup=1,
                  repeats=1,
                  threshold=0.5):
    img, scale_factor = image_preprocess(image_file, image_shape)
    inputs = {}  # collect feed tensors by input name
    inputs['x2paddle_images'] = img
    input_names = predictor.get_input_names()
    for i in range(len(input_names)):
        input_tensor = predictor.get_input_handle(input_names[i])
        input_tensor.copy_from_cpu(inputs[input_names[i]])

    for i in range(warmup):
        predictor.run()

    np_boxes = None
    predict_time = 0.
    time_min = float("inf")
    time_max = float('-inf')
    for i in range(repeats):
        start_time = time.time()
        predictor.run()
        output_names = predictor.get_output_names()
        boxes_tensor = predictor.get_output_handle(output_names[0])
        np_boxes = boxes_tensor.copy_to_cpu()
        end_time = time.time()
        timed = end_time - start_time
        time_min = min(time_min, timed)
        time_max = max(time_max, timed)
        predict_time += timed

    time_avg = predict_time / repeats
    print('Inference time(ms): min={}, max={}, avg={}'.format(
        round(time_min * 1000, 2),
        round(time_max * 1000, 1), round(time_avg * 1000, 1)))
    postprocess = YOLOv5PostProcess(
        score_threshold=0.001, nms_threshold=0.6, multi_label=True)
    res = postprocess(np_boxes, scale_factor)
    res_img = draw_box(
        image_file, res['bbox'], CLASS_LABEL, threshold=threshold)
    cv2.imwrite('result.jpg', res_img)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_file', type=str, default=None, help="image path")
    parser.add_argument(
        '--model_path', type=str, help="inference model filepath")
    parser.add_argument(
        '--benchmark',
        type=bool,
        default=False,
        help="Whether run benchmark or not.")
    parser.add_argument(
        '--run_mode',
        type=str,
        default='paddle',
        help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)")
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU")
    parser.add_argument(
        '--img_shape', type=int, default=640, help="input_size")
    args = parser.parse_args()

    predictor = load_predictor(
        args.model_path, run_mode=args.run_mode, device=args.device)
    warmup, repeats = 1, 1
    if args.benchmark:
        warmup, repeats = 50, 100
    predict_image(
        predictor,
        args.image_file,
        image_shape=[args.img_shape, args.img_shape],
        warmup=warmup,
        repeats=repeats)
```
example/auto_compression/pytorch_yolov5/post_process.py
已删除
100644 → 0
浏览文件 @
407d47b3
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import cv2
import json
import sys


def box_area(boxes):
    """
    Args:
        boxes(np.ndarray): [N, 4]
    return: [N]
    """
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


def box_iou(box1, box2):
    """
    Args:
        box1(np.ndarray): [N, 4]
        box2(np.ndarray): [M, 4]
    return: [N, M]
    """
    area1 = box_area(box1)
    area2 = box_area(box2)
    lt = np.maximum(box1[:, np.newaxis, :2], box2[:, :2])
    rb = np.minimum(box1[:, np.newaxis, 2:], box2[:, 2:])
    wh = rb - lt
    wh = np.maximum(0, wh)
    inter = wh[:, :, 0] * wh[:, :, 1]
    iou = inter / (area1[:, np.newaxis] + area2 - inter)
    return iou


def nms(boxes, scores, iou_threshold):
    """
    Non Max Suppression numpy implementation.
    args:
        boxes(np.ndarray): [N, 4]
        scores(np.ndarray): [N, 1]
        iou_threshold(float): Threshold of IoU.
    """
    idxs = scores.argsort()
    keep = []
    while idxs.size > 0:
        max_score_index = idxs[-1]
        max_score_box = boxes[max_score_index][None, :]
        keep.append(max_score_index)
        if idxs.size == 1:
            break
        idxs = idxs[:-1]
        other_boxes = boxes[idxs]
        ious = box_iou(max_score_box, other_boxes)
        idxs = idxs[ious[0] <= iou_threshold]
    keep = np.array(keep)
    return keep


class YOLOv5PostProcess(object):
    """
    Post process of YOLOv5 network.
    args:
        score_threshold(float): Threshold to filter out bounding boxes with low
            confidence score. If not provided, consider all boxes.
        nms_threshold(float): The threshold to be used in NMS.
        multi_label(bool): Whether keep multi label in boxes.
        keep_top_k(int): Number of total bboxes to be kept per image after NMS
            step. -1 means keeping all bboxes after NMS step.
    """

    def __init__(self,
                 score_threshold=0.25,
                 nms_threshold=0.5,
                 multi_label=False,
                 keep_top_k=300):
        self.score_threshold = score_threshold
        self.nms_threshold = nms_threshold
        self.multi_label = multi_label
        self.keep_top_k = keep_top_k

    def _xywh2xyxy(self, x):
        # Convert from [x, y, w, h] to [x1, y1, x2, y2]
        y = np.copy(x)
        y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
        y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
        y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
        y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
        return y

    def _non_max_suppression(self, prediction):
        max_wh = 4096  # (pixels) minimum and maximum box width and height
        nms_top_k = 30000

        cand_boxes = prediction[..., 4] > self.score_threshold  # candidates
        output = [np.zeros((0, 6))] * prediction.shape[0]

        for batch_id, boxes in enumerate(prediction):
            # Apply constraints
            boxes = boxes[cand_boxes[batch_id]]
            if not boxes.shape[0]:
                continue
            # Compute conf (conf = obj_conf * cls_conf)
            boxes[:, 5:] *= boxes[:, 4:5]

            # Box (center x, center y, width, height) to (x1, y1, x2, y2)
            convert_box = self._xywh2xyxy(boxes[:, :4])

            # Detections matrix nx6 (xyxy, conf, cls)
            if self.multi_label:
                i, j = (boxes[:, 5:] > self.score_threshold).nonzero()
                boxes = np.concatenate(
                    (convert_box[i], boxes[i, j + 5, None],
                     j[:, None].astype(np.float32)),
                    axis=1)
            else:
                conf = np.max(boxes[:, 5:], axis=1)
                j = np.argmax(boxes[:, 5:], axis=1)
                re = np.array(conf.reshape(-1) > self.score_threshold)
                conf = conf.reshape(-1, 1)
                j = j.reshape(-1, 1)
                boxes = np.concatenate((convert_box, conf, j), axis=1)[re]

            num_box = boxes.shape[0]
            if not num_box:
                continue
            elif num_box > nms_top_k:
                boxes = boxes[boxes[:, 4].argsort()[::-1][:nms_top_k]]

            # Batched NMS
            c = boxes[:, 5:6] * max_wh
            clean_boxes, scores = boxes[:, :4] + c, boxes[:, 4]
            keep = nms(clean_boxes, scores, self.nms_threshold)
            # limit detection box num
            if keep.shape[0] > self.keep_top_k:
                keep = keep[:self.keep_top_k]
            output[batch_id] = boxes[keep]
        return output

    def __call__(self, outs, scale_factor):
        preds = self._non_max_suppression(outs)
        bboxs, box_nums = [], []
        for i, pred in enumerate(preds):
            if len(pred.shape) > 2:
                pred = np.squeeze(pred)
            if len(pred.shape) == 1:
                pred = pred[np.newaxis, :]
            pred_bboxes = pred[:, :4]
            scale_factor = np.tile(scale_factor[i][::-1], (1, 2))
            pred_bboxes /= scale_factor
            bbox = np.concatenate(
                [
                    pred[:, -1][:, np.newaxis], pred[:, -2][:, np.newaxis],
                    pred_bboxes
                ],
                axis=-1)
            bboxs.append(bbox)
            box_num = bbox.shape[0]
            box_nums.append(box_num)
        bboxs = np.concatenate(bboxs, axis=0)
        box_nums = np.array(box_nums)
        return {'bbox': bboxs, 'bbox_num': box_nums}


def coco_metric(anno_file, bboxes_list, bbox_nums_list, image_id_list):
    try:
        from pycocotools.coco import COCO
        from pycocotools.cocoeval import COCOeval
    except:
        print(
            "[ERROR] Not found pycocotools, please install by `pip install pycocotools`"
        )
        sys.exit(1)

    coco_gt = COCO(anno_file)
    cats = coco_gt.loadCats(coco_gt.getCatIds())
    clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
    results = []
    for bboxes, bbox_nums, image_id in zip(bboxes_list, bbox_nums_list,
                                           image_id_list):
        results += _get_det_res(bboxes, bbox_nums, image_id, clsid2catid)

    output = "bbox.json"
    with open(output, 'w') as f:
        json.dump(results, f)

    coco_dt = coco_gt.loadRes(output)
    coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    return coco_eval.stats


def _get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map):
    det_res = []
    k = 0
    for i in range(len(bbox_nums)):
        cur_image_id = int(image_id[i][0])
        det_nums = bbox_nums[i]
        for j in range(det_nums):
            dt = bboxes[k]
            k = k + 1
            num_id, score, xmin, ymin, xmax, ymax = dt.tolist()
            if int(num_id) < 0:
                continue
            category_id = label_to_cat_id_map[int(num_id)]
            w = xmax - xmin
            h = ymax - ymin
            bbox = [xmin, ymin, w, h]
            dt_res = {
                'image_id': cur_image_id,
                'category_id': category_id,
                'bbox': bbox,
                'score': score
            }
            det_res.append(dt_res)
    return det_res
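For reference, a minimal sketch of how these helpers fit together. The `raw_prediction` array here is a hypothetical stand-in for a real YOLOv5 head output of shape [batch, num_candidates, 5 + num_classes]; only the shapes and the result layout are taken from the code above:

```python
import numpy as np
from post_process import YOLOv5PostProcess

# Hypothetical raw output: 1 image, 100 candidate boxes, 80 COCO classes.
# Each row is [cx, cy, w, h, objectness, class scores...].
raw_prediction = np.random.rand(1, 100, 85).astype(np.float32)
# scale_factor holds [scale_y, scale_x] per image, as produced by the dataset.
scale_factor = np.array([[0.5, 0.5]], dtype=np.float32)

postprocess = YOLOv5PostProcess(
    score_threshold=0.001, nms_threshold=0.65, multi_label=True)
res = postprocess(raw_prediction, scale_factor)
# res['bbox'] rows are [class_id, score, x1, y1, x2, y2];
# res['bbox_num'] holds the per-image box counts.
print(res['bbox'].shape, res['bbox_num'])
```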
example/auto_compression/pytorch_yolov5/post_quant.py (deleted, 100644 → 0)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import numpy as np
import argparse

import paddle
from paddleslim.common import load_config
from paddleslim.common import load_onnx_model
from paddleslim.quant import quant_post_static
from dataset import COCOTrainDataset


def argsparser():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--config_path',
        type=str,
        default=None,
        help="path of compression strategy config.",
        required=True)
    parser.add_argument(
        '--save_dir',
        type=str,
        default='ptq_out',
        help="directory to save compressed model.")
    parser.add_argument(
        '--devices',
        type=str,
        default='gpu',
        help="which device used to compress.")
    parser.add_argument(
        '--algo', type=str, default='KL', help="post quant algo.")
    return parser


def main():
    global global_config
    all_config = load_config(FLAGS.config_path)
    global_config = all_config["Global"]

    dataset = COCOTrainDataset(
        dataset_dir=global_config['dataset_dir'],
        image_dir=global_config['val_image_dir'],
        anno_path=global_config['val_anno_path'])
    train_loader = paddle.io.DataLoader(
        dataset, batch_size=1, shuffle=True, drop_last=True, num_workers=0)

    place = paddle.CUDAPlace(0) if FLAGS.devices == 'gpu' else paddle.CPUPlace()
    exe = paddle.static.Executor(place)

    load_onnx_model(global_config["model_dir"])
    inference_model_path = global_config["model_dir"].rstrip().rstrip(
        '.onnx') + '_infer'

    quant_post_static(
        executor=exe,
        model_dir=inference_model_path,
        quantize_model_path=FLAGS.save_dir,
        data_loader=train_loader,
        model_filename='model.pdmodel',
        params_filename='model.pdiparams',
        batch_size=32,
        batch_nums=10,
        algo=FLAGS.algo,
        hist_percent=0.999,
        is_full_quantize=False,
        bias_correction=False,
        onnx_format=True)


if __name__ == '__main__':
    paddle.enable_static()
    parser = argsparser()
    FLAGS = parser.parse_args()

    assert FLAGS.devices in ['cpu', 'gpu', 'xpu', 'npu']
    paddle.set_device(FLAGS.devices)

    main()
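As a quick sanity check (a sketch, not part of the original script), the quantized program written by `quant_post_static` can be loaded back with the static-graph API. The `ptq_out/model` path prefix is an assumption; the exact file names under the save directory depend on the PaddleSlim version:

```python
import paddle

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())

# Assumed output layout: quant_post_static(...) above wrote the quantized
# program under ptq_out/; adjust the prefix to the files actually produced.
program, feed_names, fetch_targets = paddle.static.load_inference_model(
    'ptq_out/model', exe)  # hypothetical path prefix
print(feed_names, [t.name for t in fetch_targets])
```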
example/auto_compression/pytorch_yolov5/run.py (deleted, 100644 → 0)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import numpy as np
import argparse
from tqdm import tqdm

import paddle
from paddleslim.common import load_config as load_slim_config
from paddleslim.auto_compression import AutoCompression
from dataset import COCOValDataset, COCOTrainDataset
from post_process import YOLOv5PostProcess, coco_metric


def argsparser():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--config_path',
        type=str,
        default=None,
        help="path of compression strategy config.",
        required=True)
    parser.add_argument(
        '--save_dir',
        type=str,
        default='output',
        help="directory to save compressed model.")
    parser.add_argument(
        '--devices',
        type=str,
        default='gpu',
        help="which device used to compress.")
    parser.add_argument(
        '--eval', type=bool, default=False, help="whether to run evaluation.")
    return parser


def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
    bboxes_list, bbox_nums_list, image_id_list = [], [], []
    with tqdm(
            total=len(val_loader),
            bar_format='Evaluation stage, Run batch:|{bar}| {n_fmt}/{total_fmt}',
            ncols=80) as t:
        for data in val_loader:
            data_all = {k: np.array(v) for k, v in data.items()}
            outs = exe.run(compiled_test_program,
                           feed={test_feed_names[0]: data_all['image']},
                           fetch_list=test_fetch_list,
                           return_numpy=False)
            res = {}
            postprocess = YOLOv5PostProcess(
                score_threshold=0.001, nms_threshold=0.65, multi_label=True)
            res = postprocess(np.array(outs[0]), data_all['scale_factor'])
            bboxes_list.append(res['bbox'])
            bbox_nums_list.append(res['bbox_num'])
            image_id_list.append(np.array(data_all['im_id']))
            t.update()
    map_res = coco_metric(anno_file, bboxes_list, bbox_nums_list,
                          image_id_list)
    return map_res[0]


def main():
    global global_config
    all_config = load_slim_config(FLAGS.config_path)
    assert "Global" in all_config, "Key 'Global' not found in config file.\n{}".format(
        all_config)
    global_config = all_config["Global"]

    dataset = COCOTrainDataset(
        dataset_dir=global_config['dataset_dir'],
        image_dir=global_config['train_image_dir'],
        anno_path=global_config['train_anno_path'])
    train_loader = paddle.io.DataLoader(
        dataset, batch_size=1, shuffle=True, drop_last=True, num_workers=0)

    if 'Evaluation' in global_config.keys() and global_config[
            'Evaluation'] and paddle.distributed.get_rank() == 0:
        eval_func = eval_function
        global val_loader
        dataset = COCOValDataset(
            dataset_dir=global_config['dataset_dir'],
            image_dir=global_config['val_image_dir'],
            anno_path=global_config['val_anno_path'])
        global anno_file
        anno_file = dataset.ann_file
        val_loader = paddle.io.DataLoader(
            dataset,
            batch_size=1,
            shuffle=False,
            drop_last=False,
            num_workers=0)
    else:
        eval_func = None

    ac = AutoCompression(
        model_dir=global_config["model_dir"],
        train_dataloader=train_loader,
        save_dir=FLAGS.save_dir,
        config=all_config,
        eval_callback=eval_func)
    ac.compress()


if __name__ == '__main__':
    paddle.enable_static()
    parser = argsparser()
    FLAGS = parser.parse_args()

    assert FLAGS.devices in ['cpu', 'gpu', 'xpu', 'npu']
    paddle.set_device(FLAGS.devices)

    main()
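The `eval_callback` contract used above is simply a callable that takes the executor, the compiled test program, and the feed/fetch lists, and returns a single scalar metric. A minimal stub (a sketch; the fixed zero batch is a placeholder, and real use would iterate a val_loader and compute COCO mAP as in `eval_function`) looks like:

```python
import numpy as np

def dummy_eval_function(exe, compiled_test_program, test_feed_names,
                        test_fetch_list):
    # Run the program on one fixed batch and reduce the first output to a
    # scalar so AutoCompression has a number to compare between epochs.
    fake_image = np.zeros([1, 3, 640, 640], dtype=np.float32)  # placeholder
    outs = exe.run(compiled_test_program,
                   feed={test_feed_names[0]: fake_image},
                   fetch_list=test_fetch_list,
                   return_numpy=False)
    return float(np.array(outs[0]).mean())
```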
example/auto_compression/pytorch_yolov5/yolov5_onnx_trt.py (deleted, 100644 → 0)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import cv2
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import os
import time
import random
import argparse

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
EXPLICIT_PRECISION = 1 << (int)(
    trt.NetworkDefinitionCreationFlag.EXPLICIT_PRECISION)

# load coco labels
CLASS_LABEL = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
    "truck", "boat", "traffic light", "fire hydrant", "stop sign",
    "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag",
    "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
    "baseball bat", "baseball glove", "skateboard", "surfboard",
    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon",
    "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
    "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant",
    "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote",
    "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
    "hair drier", "toothbrush"
]


def preprocess(image, input_size, mean=None, std=None, swap=(2, 0, 1)):
    if len(image.shape) == 3:
        padded_img = np.ones((input_size[0], input_size[1], 3)) * 114.0
    else:
        padded_img = np.ones(input_size) * 114.0
    img = np.array(image)
    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
    resized_img = cv2.resize(
        img,
        (int(img.shape[1] * r), int(img.shape[0] * r)),
        interpolation=cv2.INTER_LINEAR,
    ).astype(np.float32)
    padded_img[:int(img.shape[0] * r), :int(img.shape[1] * r)] = resized_img
    padded_img = padded_img[:, :, ::-1]
    padded_img /= 255.0
    if mean is not None:
        padded_img -= mean
    if std is not None:
        padded_img /= std
    padded_img = padded_img.transpose(swap)
    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
    return padded_img, r


def postprocess(predictions, ratio):
    boxes = predictions[:, :4]
    scores = predictions[:, 4:5] * predictions[:, 5:]
    boxes_xyxy = np.ones_like(boxes)
    boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.
    boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.
    boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.
    boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.
    boxes_xyxy /= ratio
    dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
    return dets


def nms(boxes, scores, nms_thr):
    """Single class NMS implemented in Numpy."""
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        inds = np.where(ovr <= nms_thr)[0]
        order = order[inds + 1]
    return keep


def multiclass_nms(boxes, scores, nms_thr, score_thr):
    """Multiclass NMS implemented in Numpy"""
    final_dets = []
    num_classes = scores.shape[1]
    for cls_ind in range(num_classes):
        cls_scores = scores[:, cls_ind]
        valid_score_mask = cls_scores > score_thr
        if valid_score_mask.sum() == 0:
            continue
        else:
            valid_scores = cls_scores[valid_score_mask]
            valid_boxes = boxes[valid_score_mask]
            keep = nms(valid_boxes, valid_scores, nms_thr)
            if len(keep) > 0:
                cls_inds = np.ones((len(keep), 1)) * cls_ind
                dets = np.concatenate(
                    [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1)
                final_dets.append(dets)
    if len(final_dets) == 0:
        return None
    return np.concatenate(final_dets, 0)


def get_color_map_list(num_classes):
    color_map = num_classes * [0, 0, 0]
    for i in range(0, num_classes):
        j = 0
        lab = i
        while lab:
            color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
            color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
            color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
            j += 1
            lab >>= 3
    color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)]
    return color_map


def draw_box(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
    color_list = get_color_map_list(len(class_names))
    for i in range(len(boxes)):
        box = boxes[i]
        cls_id = int(cls_ids[i])
        color = tuple(color_list[cls_id])
        score = scores[i]
        if score < conf:
            continue
        x0 = int(box[0])
        y0 = int(box[1])
        x1 = int(box[2])
        y1 = int(box[3])

        text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100)
        font = cv2.FONT_HERSHEY_SIMPLEX
        txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
        cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
        cv2.rectangle(img, (x0, y0 + 1),
                      (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])),
                      color, -1)
        cv2.putText(
            img,
            text, (x0, y0 + txt_size[1]),
            font,
            0.8, (0, 255, 0),
            thickness=2)
    return img


def get_engine(precision, model_file_path):
    # TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
    TRT_LOGGER = trt.Logger()
    builder = trt.Builder(TRT_LOGGER)
    config = builder.create_builder_config()
    if precision == 'int8':
        network = builder.create_network(EXPLICIT_BATCH | EXPLICIT_PRECISION)
    else:
        network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    runtime = trt.Runtime(TRT_LOGGER)

    if model_file_path.endswith('.trt'):
        # If a serialized engine exists, use it instead of building an engine.
        print("Reading engine from file {}".format(model_file_path))
        with open(model_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
        for i in range(network.num_layers):
            layer = network.get_layer(i)
            print(i, layer.name)
        return engine
    else:
        config.max_workspace_size = 1 << 30
        if precision == "fp16":
            if not builder.platform_has_fast_fp16:
                print("FP16 is not supported natively on this platform/device")
            else:
                config.set_flag(trt.BuilderFlag.FP16)
        elif precision == "int8":
            if not builder.platform_has_fast_int8:
                print("INT8 is not supported natively on this platform/device")
            else:
                if builder.platform_has_fast_fp16:
                    # Also enable fp16, as some layers may be even more efficient in fp16 than int8
                    config.set_flag(trt.BuilderFlag.FP16)
                config.set_flag(trt.BuilderFlag.INT8)
        builder.max_batch_size = 1

        print('Loading ONNX file from path {}...'.format(model_file_path))
        with open(model_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        print('Completed parsing of ONNX file')

        print('Building an engine from file {}; this may take a while...'.
              format(model_file_path))
        plan = builder.build_serialized_network(network, config)
        engine = runtime.deserialize_cuda_engine(plan)
        print("Completed creating Engine")
        with open(model_file_path, "wb") as f:
            f.write(engine.serialize())
        for i in range(network.num_layers):
            layer = network.get_layer(i)
            print(i, layer.name)
        return engine


# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(
            binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def run_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def main(args):
    onnx_model = args.model_path
    img_path = args.image_file
    num_class = len(CLASS_LABEL)
    repeat = 1000
    engine = get_engine(args.precision, onnx_model)
    model_all_names = []
    for idx in range(engine.num_bindings):
        is_input = engine.binding_is_input(idx)
        name = engine.get_binding_name(idx)
        op_type = engine.get_binding_dtype(idx)
        model_all_names.append(name)
        shape = engine.get_binding_shape(idx)
        print('input id:', idx, ' is input: ', is_input, ' binding name:',
              name, ' shape:', shape, 'type: ', op_type)

    context = engine.create_execution_context()
    print('Allocate buffers ...')
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    print("TRT set input ...")

    origin_img = cv2.imread(img_path)
    input_shape = [args.img_shape, args.img_shape]
    input_image, ratio = preprocess(origin_img, input_shape)
    inputs[0].host = np.expand_dims(input_image, axis=0)

    for _ in range(0, 50):
        trt_outputs = run_inference(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream)

    time1 = time.time()
    for _ in range(0, repeat):
        trt_outputs = run_inference(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream)
    time2 = time.time()
    # total time cost(ms)
    total_inference_cost = (time2 - time1) * 1000
    print("model path: ", onnx_model, " precision: ", args.precision)
    print("In TensorRT, ",
          "average latency is : {} ms".format(total_inference_cost / repeat))

    # Do postprocess
    output = trt_outputs[0]
    predictions = np.reshape(output, (1, -1, int(5 + num_class)))[0]
    dets = postprocess(predictions, ratio)
    # Draw rectangles and labels on the original image
    if dets is not None:
        final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
        origin_img = draw_box(
            origin_img,
            final_boxes,
            final_scores,
            final_cls_inds,
            conf=0.5,
            class_names=CLASS_LABEL)
    cv2.imwrite('output.jpg', origin_img)
    print('The prediction results are saved in output.jpg.')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_path',
        type=str,
        default="quant_model.onnx",
        help="inference model filepath")
    parser.add_argument(
        '--image_file', type=str, default="bus.jpg", help="image path")
    parser.add_argument(
        '--precision', type=str, default='fp32', help="support fp32/fp16/int8.")
    parser.add_argument(
        '--img_shape', type=int, default=640, help="input_size")
    args = parser.parse_args()
    main(args)
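To make the letterbox arithmetic in `preprocess` concrete: for a 720x1280 (HxW) image and a 640x640 input, the scale is r = min(640/720, 640/1280) = 0.5, so the image is resized to 360x640 and the remaining canvas is filled with 114. A small standalone check mirroring the math above:

```python
h, w = 720, 1280          # original image (H, W)
input_size = (640, 640)   # network input (H, W)
r = min(input_size[0] / h, input_size[1] / w)  # min(0.888..., 0.5) = 0.5
resized = (int(w * r), int(h * r))             # (640, 360) as (W, H) for cv2.resize
print(r, resized)  # boxes predicted at 640x640 are divided by r to map back
```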
example/auto_compression/pytorch_yolov6/README.md (deleted, 100644 → 0)
# YOLOv6 Auto Compression Example

Contents:

- [1. Introduction](#1-introduction)
- [2. Benchmark](#2-benchmark)
- [3. Auto Compression Workflow](#3-auto-compression-workflow)
  - [3.1 Prepare the Environment](#31-prepare-the-environment)
  - [3.2 Prepare the Dataset](#32-prepare-the-dataset)
  - [3.3 Prepare the Inference Model](#33-prepare-the-inference-model)
  - [3.4 Run Auto Compression and Produce the Model](#34-run-auto-compression-and-produce-the-model)
  - [3.5 Evaluate Model Accuracy](#35-evaluate-model-accuracy)
- [4. Deployment](#4-deployment)
- [5. FAQ](#5-faq)

## 1. Introduction

The model conversion tool [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) converts `Caffe/TensorFlow/ONNX/PyTorch` models into PaddlePaddle inference models in one step. With X2Paddle, inference models from these frameworks can easily use PaddleSlim's auto compression (ACT) features.

This example takes the [meituan/YOLOv6](https://github.com/meituan/YOLOv6) object detection model, converts it from the PyTorch framework to a Paddle model, and then compresses it with ACT. The compression strategy used in this example is quantization-aware training with distillation.

## 2. Benchmark

| Model | Strategy | Input Size | mAP<sup>val<br>0.5:0.95 | Latency<sup><small>FP32</small><sup><br><sup>(ms) | Latency<sup><small>FP16</small><sup><br><sup>(ms) | Latency<sup><small>INT8</small><sup><br><sup>(ms) | Config | Inference Model |
| :-------- |:-------- |:--------: | :---------------------: | :----------------: | :----------------: | :---------------: | :-----------------------------: | :-----------------------------: |
| YOLOv6s | Base model | 640*640 | 42.4 | 9.06ms | 2.90ms | - | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov6s.onnx) |
| YOLOv6s | KL post-training quantization | 640*640 | 30.3 | - | - | 1.83ms | - | - |
| YOLOv6s | Quant-aware training with distillation | 640*640 | **41.3** | - | - | **1.83ms** | [config](./configs/yolov6s_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov6s_quant.tar) \| [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov6s_quant.onnx) |

Notes:
- All mAP values are measured on the COCO val2017 dataset.
- YOLOv6s latency is measured on a Tesla T4 GPU with TensorRT 8.4.1 enabled and batch_size=1; the test script is [cpp_infer](./cpp_infer).

## 3. Auto Compression Workflow

#### 3.1 Prepare the Environment

- PaddlePaddle >= 2.3 (download and install from the [Paddle website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html))
- PaddleSlim > 2.3
- PaddleDet >= 2.4
- opencv-python

(1) Install paddlepaddle:
```shell
# CPU
pip install paddlepaddle
# GPU
pip install paddlepaddle-gpu
```

(2) Install paddleslim:
```shell
pip install paddleslim
```

(3) Install paddledet:
```shell
pip install paddledet
```

Note: PaddleDet is installed only to reuse the Dataloader components of PaddleDetection.

#### 3.2 Prepare the Dataset

By default this example runs auto compression on the COCO dataset and relies on PaddleDetection's data-loading module. For custom COCO-format data, or data in other formats, follow the [PaddleDetection data preparation guide](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/docs/tutorials/PrepareDataSet.md).

If the dataset is already prepared, just set the `dataset_dir` field of `EvalDataset` in [./configs/yolov6_reader.yml] to your dataset path.

#### 3.3 Prepare the Inference Model

(1) Prepare the ONNX model:

Follow the official [export tutorial](https://github.com/meituan/YOLOv6/blob/main/deploy/ONNX/README.md) of [meituan/YOLOv6](https://github.com/meituan/YOLOv6) to produce an ONNX model, or download the ready-made [yolov6s.onnx](https://paddle-slim-models.bj.bcebos.com/act/yolov6s.onnx).

#### 3.4 Run Auto Compression and Produce the Model

The distillation-quantization auto compression example is launched through run.py, which compresses the model with the `paddleslim.auto_compression.AutoCompression` API. Configure the model path, distillation, quantization, and training parameters in the config file; once configured, the model can be quantized and distilled. Run:

- Single-GPU training:
```
export CUDA_VISIBLE_DEVICES=0
python run.py --config_path=./configs/yolov6s_qat_dis.yaml --save_dir='./output/'
```

- Multi-GPU training:
```
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 run.py \
          --config_path=./configs/yolov6s_qat_dis.yaml --save_dir='./output/'
```

#### 3.5 Evaluate Model Accuracy

Set the `model_dir` field of [yolov6s_qat_dis.yaml](./configs/yolov6s_qat_dis.yaml) to the saved model path, then obtain the model's mAP with eval.py:
```
export CUDA_VISIBLE_DEVICES=0
python eval.py --config_path=./configs/yolov6s_qat_dis.yaml
```

## 4. Deployment

#### Export to ONNX and deploy with TensorRT

- First install paddle2onnx:
```shell
pip install paddle2onnx==1.0.0rc3
```

- Then export the quantized model to ONNX:
```shell
paddle2onnx --model_dir output/ \
            --model_filename model.pdmodel \
            --params_filename model.pdiparams \
            --opset_version 13 \
            --enable_onnx_checker True \
            --save_file yolov6s_quant.onnx \
            --deploy_backend tensorrt
```

- Run the test:
```shell
python yolov6_onnx_trt.py --model_path=yolov6s_quant.onnx --image_file=images/000000570688.jpg --precision=int8
```

#### Paddle-TensorRT

- C++ deployment:

Enter the [cpp_infer](./cpp_infer) folder, prepare the environment and build following the [C++ TensorRT Benchmark tutorial](./cpp_infer/README.md), then run the test:
```shell
# build
bash compile.sh
# run
./build/trt_run --model_file yolov6s_quant/model.pdmodel --params_file yolov6s_quant/model.pdiparams --run_mode=trt_int8
```

- Python deployment:

First install a [Paddle package](https://www.paddlepaddle.org.cn/inference/v2.3/user_guides/download_lib.html#python) built with TensorRT.

Then deploy with [paddle_trt_infer.py](./paddle_trt_infer.py):
```shell
python paddle_trt_infer.py --model_path=output --image_file=images/000000570688.jpg --benchmark=True --run_mode=trt_int8
```

## 5. FAQ

- To evaluate the accuracy of the post-training quantized model, run:
```shell
python post_quant.py --config_path=./configs/yolov6s_qat_dis.yaml
```
example/auto_compression/pytorch_yolov6/cpp_infer/CMakeLists.txt (deleted, 100644 → 0)
cmake_minimum_required(VERSION 3.0)
project(cpp_inference_demo CXX C)
option(WITH_MKL         "Compile demo with MKL/OpenBlas support, default use MKL."     ON)
option(WITH_GPU         "Compile demo with GPU/CPU, default use CPU."                  OFF)
option(WITH_STATIC_LIB  "Compile demo with static/shared library, default use static." ON)
option(USE_TENSORRT     "Compile demo with TensorRT."                                  OFF)
option(WITH_ROCM        "Compile demo with rocm."                                      OFF)
option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime"                                OFF)
option(WITH_ARM         "Compile demo with ARM"                                        OFF)
option(WITH_MIPS        "Compile demo with MIPS"                                       OFF)
option(WITH_SW          "Compile demo with SW"                                         OFF)
option(WITH_XPU         "Compile demo with xpu"                                        OFF)
option(WITH_NPU         "Compile demo with npu"                                        OFF)

if(NOT WITH_STATIC_LIB)
  add_definitions("-DPADDLE_WITH_SHARED_LIB")
else()
  # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode.
  # Set it to empty in static library mode to avoid compilation issues.
  add_definitions("/DPD_INFER_DECL=")
endif()

macro(safe_set_static_flag)
    foreach(flag_var
        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
      if(${flag_var} MATCHES "/MD")
        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
      endif(${flag_var} MATCHES "/MD")
    endforeach(flag_var)
endmacro()

if(NOT DEFINED PADDLE_LIB)
  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
endif()
if(NOT DEFINED DEMO_NAME)
  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
endif()

include_directories("${PADDLE_LIB}/")
set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include")

link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
link_directories("${PADDLE_LIB}/paddle/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib")

if(WIN32)
  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
  option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
  if(MSVC_STATIC_CRT)
    if(WITH_MKL)
      set(FLAG_OPENMP "/openmp")
    endif()
    set(CMAKE_C_FLAGS_DEBUG     "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
    set(CMAKE_C_FLAGS_RELEASE   "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
    set(CMAKE_CXX_FLAGS_DEBUG   "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
    safe_set_static_flag()
    if(WITH_STATIC_LIB)
      add_definitions(-DSTATIC_LIB)
    endif()
  endif()
else()
  if(WITH_MKL)
    set(FLAG_OPENMP "-fopenmp")
  endif()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}")
endif()

if(WITH_GPU)
  if(NOT WIN32)
    include_directories("/usr/local/cuda/include")
    if(CUDA_LIB STREQUAL "")
      set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
    endif()
  else()
    include_directories("C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include")
    if(CUDA_LIB STREQUAL "")
      set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
    endif()
  endif(NOT WIN32)
endif()

if(USE_TENSORRT AND WITH_GPU)
  set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library")
  if("${TENSORRT_ROOT}" STREQUAL "")
    message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ")
  endif()
  set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include)
  set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib)
  file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
  string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
    "${TENSORRT_VERSION_FILE_CONTENTS}")
  if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
    file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h TENSORRT_VERSION_FILE_CONTENTS)
    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
      "${TENSORRT_VERSION_FILE_CONTENTS}")
  endif()
  if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
    message(SEND_ERROR "Failed to detect TensorRT version.")
  endif()
  string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
    TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
  message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
    "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
  include_directories("${TENSORRT_INCLUDE_DIR}")
  link_directories("${TENSORRT_LIB_DIR}")
endif()

if(WITH_MKL)
  set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml")
  include_directories("${MATH_LIB_PATH}/include")
  if(WIN32)
    set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
      ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
  else()
    set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
      ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
  endif()
  set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn")
  if(EXISTS ${MKLDNN_PATH})
    include_directories("${MKLDNN_PATH}/include")
    if(WIN32)
      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
    else(WIN32)
      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
    endif(WIN32)
  endif()
elseif((NOT WITH_MIPS) AND (NOT WITH_SW))
  set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas")
  include_directories("${OPENBLAS_LIB_PATH}/include/openblas")
  if(WIN32)
    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX})
  else()
    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
  endif()
endif()

if(WITH_STATIC_LIB)
  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
else()
  if(WIN32)
    set(DEPS ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
  else()
    set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX})
  endif()
endif()

if(WITH_ONNXRUNTIME)
  if(WIN32)
    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx)
  elseif(APPLE)
    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx)
  else()
    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx)
  endif()
endif()

if(NOT WIN32)
  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
  set(DEPS ${DEPS}
      ${MATH_LIB} ${MKLDNN_LIB}
      glog gflags protobuf xxhash cryptopp
      ${EXTERNAL_LIB})
else()
  set(DEPS ${DEPS}
      ${MATH_LIB} ${MKLDNN_LIB}
      glog gflags_static libprotobuf xxhash cryptopp-static ${EXTERNAL_LIB})
  set(DEPS ${DEPS} shlwapi.lib)
endif(NOT WIN32)

if(WITH_GPU)
  if(NOT WIN32)
    if(USE_TENSORRT)
      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX})
      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX})
    endif()
    set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
  else()
    if(USE_TENSORRT)
      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
      if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
        set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX})
      endif()
    endif()
    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX})
    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX})
    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX})
  endif()
endif()

if(WITH_ROCM AND NOT WIN32)
  set(DEPS ${DEPS} ${ROCM_LIB}/libamdhip64${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()

if(WITH_XPU AND NOT WIN32)
  set(XPU_INSTALL_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}xpu")
  set(DEPS ${DEPS} ${XPU_INSTALL_PATH}/lib/libxpuapi${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(DEPS ${DEPS} ${XPU_INSTALL_PATH}/lib/libxpurt${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()

if(WITH_NPU AND NOT WIN32)
  set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libgraph${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libge_runner${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libascendcl${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libacl_op_compiler${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()

add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
target_link_libraries(${DEMO_NAME} ${DEPS})

if(WIN32)
  if(USE_TENSORRT)
    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
            COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}
              ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
            COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
              ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
    )
    if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
      add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
              COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX}
                ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE})
    endif()
  endif()
  if(WITH_MKL)
    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
          COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release
          COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release
          COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/Release
    )
  else()
    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
          COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release
    )
  endif()
  if(WITH_ONNXRUNTIME)
    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
      COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
    )
  endif()
  if(NOT WITH_STATIC_LIB)
    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
    )
  endif()
endif()
example/auto_compression/pytorch_yolov6/cpp_infer/README.md (deleted, 100644 → 0)
# YOLOv6 TensorRT Benchmark (Linux)

## Environment Preparation

- CUDA, cuDNN: make sure CUDA and cuDNN are installed and note their install paths in advance.
- TensorRT: download [TensorRT 8.4.1.5](https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.4.1/tars/tensorrt-8.4.1.5.linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz) or another version from the NVIDIA website.
- Paddle Inference C++ library: to build the develop branch, follow the [build guide](https://www.paddlepaddle.org.cn/inference/user_guides/source_compile.html). After the build finishes, a `paddle_inference_install_dir` folder is generated under the build directory; this is the C++ inference library we need.

## Build the Executable

- (1) Edit the dependency paths in `compile.sh`, mainly the following:
```shell
# Paddle Inference library path
LIB_DIR=/root/auto_compress/Paddle/build/paddle_inference_install_dir/
# cuDNN path
CUDNN_LIB=/usr/lib/x86_64-linux-gnu/
# CUDA path
CUDA_LIB=/usr/local/cuda/lib64
# TensorRT install path: the absolute path of the extracted TRT package, containing `lib` and `include` folders
TENSORRT_ROOT=/root/auto_compress/trt/trt8.4/
```

## Test

- FP32
```
./build/trt_run --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_fp32
```
- FP16
```
./build/trt_run --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_fp16
```
- INT8
```
./build/trt_run --model_file yolov6s_quant/model.pdmodel --params_file yolov6s_quant/model.pdiparams --run_mode=trt_int8
```

## Performance Comparison

| Model | Latency<sup><small>FP32</small><sup><br><sup>(ms) | Latency<sup><small>FP16</small><sup><br><sup>(ms) | Latency<sup><small>INT8</small><sup><br><sup>(ms) |
| :-------- |:-------- |:--------: | :---------------------: |
| YOLOv6s | 9.06ms | 2.90ms | 1.83ms |

Environment:
- Tesla T4, TensorRT 8.4.1, CUDA 11.2
- batch_size=1
example/auto_compression/pytorch_yolov6/cpp_infer/compile.sh (deleted, 100644 → 0)
#!/bin/bash
set +x
set -e

work_path=$(dirname $(readlink -f $0))

mkdir -p build
cd build
rm -rf *

DEMO_NAME=trt_run

WITH_MKL=ON
WITH_GPU=ON
USE_TENSORRT=ON

LIB_DIR=/root/auto_compress/Paddle/build/paddle_inference_install_dir/
CUDNN_LIB=/usr/lib/x86_64-linux-gnu/
CUDA_LIB=/usr/local/cuda/lib64
TENSORRT_ROOT=/root/auto_compress/trt/trt8.4/

WITH_ROCM=OFF
ROCM_LIB=/opt/rocm/lib

cmake .. -DPADDLE_LIB=${LIB_DIR} \
  -DWITH_MKL=${WITH_MKL} \
  -DDEMO_NAME=${DEMO_NAME} \
  -DWITH_GPU=${WITH_GPU} \
  -DWITH_STATIC_LIB=OFF \
  -DUSE_TENSORRT=${USE_TENSORRT} \
  -DWITH_ROCM=${WITH_ROCM} \
  -DROCM_LIB=${ROCM_LIB} \
  -DCUDNN_LIB=${CUDNN_LIB} \
  -DCUDA_LIB=${CUDA_LIB} \
  -DTENSORRT_ROOT=${TENSORRT_ROOT}

make -j
example/auto_compression/pytorch_yolov6/dataset.py (deleted, 100644 → 0)
from pycocotools.coco import COCO
import cv2
import os
import numpy as np
import paddle


class COCOValDataset(paddle.io.Dataset):
    def __init__(self,
                 dataset_dir=None,
                 image_dir=None,
                 anno_path=None,
                 img_size=[640, 640]):
        self.dataset_dir = dataset_dir
        self.image_dir = image_dir
        self.img_size = img_size
        self.ann_file = os.path.join(dataset_dir, anno_path)
        self.coco = COCO(self.ann_file)
        ori_ids = list(sorted(self.coco.imgs.keys()))
        # check gt bbox
        clean_ids = []
        for idx in ori_ids:
            ins_anno_ids = self.coco.getAnnIds(imgIds=[idx], iscrowd=False)
            instances = self.coco.loadAnns(ins_anno_ids)
            num_bbox = 0
            for inst in instances:
                if inst.get('ignore', False):
                    continue
                if 'bbox' not in inst.keys():
                    continue
                elif not any(np.array(inst['bbox'])):
                    continue
                else:
                    num_bbox += 1
            if num_bbox > 0:
                clean_ids.append(idx)
        self.ids = clean_ids

    def __getitem__(self, idx):
        img_id = self.ids[idx]
        img = self._get_img_data_from_img_id(img_id)
        img, scale_factor = self.image_preprocess(img, self.img_size)
        return {
            'image': img,
            'im_id': np.array([img_id]),
            'scale_factor': scale_factor
        }

    def __len__(self):
        return len(self.ids)

    def _get_img_data_from_img_id(self, img_id):
        img_info = self.coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.dataset_dir, self.image_dir,
                                img_info['file_name'])
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return img

    def _generate_scale(self, im, target_shape, keep_ratio=True):
        """
        Args:
            im (np.ndarray): image (np.ndarray)
        Returns:
            im_scale_x: the resize ratio of X
            im_scale_y: the resize ratio of Y
        """
        origin_shape = im.shape[:2]
        if keep_ratio:
            im_size_min = np.min(origin_shape)
            im_size_max = np.max(origin_shape)
            target_size_min = np.min(target_shape)
            target_size_max = np.max(target_shape)
            im_scale = float(target_size_min) / float(im_size_min)
            if np.round(im_scale * im_size_max) > target_size_max:
                im_scale = float(target_size_max) / float(im_size_max)
            im_scale_x = im_scale
            im_scale_y = im_scale
        else:
            resize_h, resize_w = target_shape
            im_scale_y = resize_h / float(origin_shape[0])
            im_scale_x = resize_w / float(origin_shape[1])
        return im_scale_y, im_scale_x

    def image_preprocess(self, img, target_shape):
        # Resize image
        im_scale_y, im_scale_x = self._generate_scale(img, target_shape)
        img = cv2.resize(
            img,
            None,
            None,
            fx=im_scale_x,
            fy=im_scale_y,
            interpolation=cv2.INTER_LINEAR)
        # Pad
        im_h, im_w = img.shape[:2]
        h, w = target_shape[:]
        if h != im_h or w != im_w:
            canvas = np.ones((h, w, 3), dtype=np.float32)
            canvas *= np.array([114.0, 114.0, 114.0], dtype=np.float32)
            canvas[0:im_h, 0:im_w, :] = img.astype(np.float32)
            img = canvas
        img = np.transpose(img / 255, [2, 0, 1])
        scale_factor = np.array([im_scale_y, im_scale_x])
        return img.astype(np.float32), scale_factor


class COCOTrainDataset(COCOValDataset):
    def __getitem__(self, idx):
        img_id = self.ids[idx]
        img = self._get_img_data_from_img_id(img_id)
        img, scale_factor = self.image_preprocess(img, self.img_size)
        return {'x2paddle_image_arrays': img}
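A minimal sketch of wiring these datasets into a loader; the paths below are placeholders, since in the scripts above they come from the `Global` section of the YAML config:

```python
import paddle
from dataset import COCOValDataset

# Placeholder paths pointing at a standard COCO layout.
val_dataset = COCOValDataset(
    dataset_dir='dataset/coco',
    image_dir='val2017',
    anno_path='annotations/instances_val2017.json')
val_loader = paddle.io.DataLoader(val_dataset, batch_size=1)

for data in val_loader:
    # 'image' is a [1, 3, 640, 640] float32 batch; 'scale_factor' maps
    # predictions back to the original image size.
    print(data['image'].shape, data['im_id'], data['scale_factor'])
    break
```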
example/auto_compression/pytorch_yolov6/eval.py (deleted, 100644 → 0)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import numpy as np
import argparse
from tqdm import tqdm

import paddle
from paddleslim.common import load_config as load_slim_config
from paddleslim.common import load_onnx_model
from post_process import YOLOv6PostProcess, coco_metric
from dataset import COCOValDataset


def argsparser():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--config_path',
        type=str,
        default=None,
        help="path of compression strategy config.",
        required=True)
    parser.add_argument(
        '--devices',
        type=str,
        default='gpu',
        help="which device used to compress.")
    return parser


def eval():
    place = paddle.CUDAPlace(0) if FLAGS.devices == 'gpu' else paddle.CPUPlace()
    exe = paddle.static.Executor(place)

    val_program, feed_target_names, fetch_targets = load_onnx_model(
        global_config["model_dir"])

    bboxes_list, bbox_nums_list, image_id_list = [], [], []
    with tqdm(
            total=len(val_loader),
            bar_format='Evaluation stage, Run batch:|{bar}| {n_fmt}/{total_fmt}',
            ncols=80) as t:
        for data in val_loader:
            data_all = {k: np.array(v) for k, v in data.items()}
            outs = exe.run(val_program,
                           feed={feed_target_names[0]: data_all['image']},
                           fetch_list=fetch_targets,
                           return_numpy=False)
            res = {}
            postprocess = YOLOv6PostProcess(
                score_threshold=0.001, nms_threshold=0.65, multi_label=True)
            res = postprocess(np.array(outs[0]), data_all['scale_factor'])
            bboxes_list.append(res['bbox'])
            bbox_nums_list.append(res['bbox_num'])
            image_id_list.append(np.array(data_all['im_id']))
            t.update()

    coco_metric(anno_file, bboxes_list, bbox_nums_list, image_id_list)


def main():
    global global_config
    all_config = load_slim_config(FLAGS.config_path)
    global_config = all_config["Global"]

    global val_loader
    dataset = COCOValDataset(
        dataset_dir=global_config['dataset_dir'],
        image_dir=global_config['val_image_dir'],
        anno_path=global_config['val_anno_path'])
    global anno_file
    anno_file = dataset.ann_file
    val_loader = paddle.io.DataLoader(dataset, batch_size=1)

    eval()


if __name__ == '__main__':
    paddle.enable_static()
    parser = argsparser()
    FLAGS = parser.parse_args()

    assert FLAGS.devices in ['cpu', 'gpu', 'xpu', 'npu']
    paddle.set_device(FLAGS.devices)

    main()
example/auto_compression/pytorch_yolov6/images/000000570688.jpg (deleted, 100644 → 0, 135.1 KB)
example/auto_compression/pytorch_yolov6/paddle_trt_infer.py (deleted, 100644 → 0)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
cv2
import
numpy
as
np
import
argparse
import
time
from
paddle.inference
import
Config
from
paddle.inference
import
create_predictor
from
post_process
import
YOLOv6PostProcess
CLASS_LABEL
=
[
'person'
,
'bicycle'
,
'car'
,
'motorcycle'
,
'airplane'
,
'bus'
,
'train'
,
'truck'
,
'boat'
,
'traffic light'
,
'fire hydrant'
,
'stop sign'
,
'parking meter'
,
'bench'
,
'bird'
,
'cat'
,
'dog'
,
'horse'
,
'sheep'
,
'cow'
,
'elephant'
,
'bear'
,
'zebra'
,
'giraffe'
,
'backpack'
,
'umbrella'
,
'handbag'
,
'tie'
,
'suitcase'
,
'frisbee'
,
'skis'
,
'snowboard'
,
'sports ball'
,
'kite'
,
'baseball bat'
,
'baseball glove'
,
'skateboard'
,
'surfboard'
,
'tennis racket'
,
'bottle'
,
'wine glass'
,
'cup'
,
'fork'
,
'knife'
,
'spoon'
,
'bowl'
,
'banana'
,
'apple'
,
'sandwich'
,
'orange'
,
'broccoli'
,
'carrot'
,
'hot dog'
,
'pizza'
,
'donut'
,
'cake'
,
'chair'
,
'couch'
,
'potted plant'
,
'bed'
,
'dining table'
,
'toilet'
,
'tv'
,
'laptop'
,
'mouse'
,
'remote'
,
'keyboard'
,
'cell phone'
,
'microwave'
,
'oven'
,
'toaster'
,
'sink'
,
'refrigerator'
,
'book'
,
'clock'
,
'vase'
,
'scissors'
,
'teddy bear'
,
'hair drier'
,
'toothbrush'
]
def
generate_scale
(
im
,
target_shape
,
keep_ratio
=
True
):
"""
Args:
im (np.ndarray): image (np.ndarray)
Returns:
im_scale_x: the resize ratio of X
im_scale_y: the resize ratio of Y
"""
origin_shape
=
im
.
shape
[:
2
]
if
keep_ratio
:
im_size_min
=
np
.
min
(
origin_shape
)
im_size_max
=
np
.
max
(
origin_shape
)
target_size_min
=
np
.
min
(
target_shape
)
target_size_max
=
np
.
max
(
target_shape
)
im_scale
=
float
(
target_size_min
)
/
float
(
im_size_min
)
if
np
.
round
(
im_scale
*
im_size_max
)
>
target_size_max
:
im_scale
=
float
(
target_size_max
)
/
float
(
im_size_max
)
im_scale_x
=
im_scale
im_scale_y
=
im_scale
else
:
resize_h
,
resize_w
=
target_shape
im_scale_y
=
resize_h
/
float
(
origin_shape
[
0
])
im_scale_x
=
resize_w
/
float
(
origin_shape
[
1
])
return
im_scale_y
,
im_scale_x
def
image_preprocess
(
img_path
,
target_shape
):
img
=
cv2
.
imread
(
img_path
)
# Resize
im_scale_y
,
im_scale_x
=
generate_scale
(
img
,
target_shape
)
img
=
cv2
.
resize
(
img
,
None
,
None
,
fx
=
im_scale_x
,
fy
=
im_scale_y
,
interpolation
=
cv2
.
INTER_LINEAR
)
# Pad
im_h
,
im_w
=
img
.
shape
[:
2
]
h
,
w
=
target_shape
[:]
if
h
!=
im_h
or
w
!=
im_w
:
canvas
=
np
.
ones
((
h
,
w
,
3
),
dtype
=
np
.
float32
)
canvas
*=
np
.
array
([
114.0
,
114.0
,
114.0
],
dtype
=
np
.
float32
)
canvas
[
0
:
im_h
,
0
:
im_w
,
:]
=
img
.
astype
(
np
.
float32
)
img
=
canvas
img
=
cv2
.
cvtColor
(
img
,
cv2
.
COLOR_BGR2RGB
)
img
=
np
.
transpose
(
img
,
[
2
,
0
,
1
])
/
255
img
=
np
.
expand_dims
(
img
,
0
)
scale_factor
=
np
.
array
([[
im_scale_y
,
im_scale_x
]])
return
img
.
astype
(
np
.
float32
),
scale_factor
def
get_color_map_list
(
num_classes
):
color_map
=
num_classes
*
[
0
,
0
,
0
]
for
i
in
range
(
0
,
num_classes
):
j
=
0
lab
=
i
while
lab
:
color_map
[
i
*
3
]
|=
(((
lab
>>
0
)
&
1
)
<<
(
7
-
j
))
color_map
[
i
*
3
+
1
]
|=
(((
lab
>>
1
)
&
1
)
<<
(
7
-
j
))
color_map
[
i
*
3
+
2
]
|=
(((
lab
>>
2
)
&
1
)
<<
(
7
-
j
))
j
+=
1
lab
>>=
3
color_map
=
[
color_map
[
i
:
i
+
3
]
for
i
in
range
(
0
,
len
(
color_map
),
3
)]
return
color_map
def
draw_box
(
image_file
,
results
,
class_label
,
threshold
=
0.5
):
srcimg
=
cv2
.
imread
(
image_file
,
1
)
for
i
in
range
(
len
(
results
)):
color_list
=
get_color_map_list
(
len
(
class_label
))
clsid2color
=
{}
classid
,
conf
=
int
(
results
[
i
,
0
]),
results
[
i
,
1
]
if
conf
<
threshold
:
continue
xmin
,
ymin
,
xmax
,
ymax
=
int
(
results
[
i
,
2
]),
int
(
results
[
i
,
3
]),
int
(
results
[
i
,
4
]),
int
(
results
[
i
,
5
])
if
classid
not
in
clsid2color
:
clsid2color
[
classid
]
=
color_list
[
classid
]
color
=
tuple
(
clsid2color
[
classid
])
cv2
.
rectangle
(
srcimg
,
(
xmin
,
ymin
),
(
xmax
,
ymax
),
color
,
thickness
=
2
)
print
(
class_label
[
classid
]
+
': '
+
str
(
round
(
conf
,
3
)))
cv2
.
putText
(
srcimg
,
class_label
[
classid
]
+
':'
+
str
(
round
(
conf
,
3
)),
(
xmin
,
ymin
-
10
),
cv2
.
FONT_HERSHEY_SIMPLEX
,
0.8
,
(
0
,
255
,
0
),
thickness
=
2
)
return
srcimg
def
load_predictor
(
model_dir
,
run_mode
=
'paddle'
,
batch_size
=1,
                   device='CPU',
                   min_subgraph_size=3,
                   use_dynamic_shape=False,
                   trt_min_shape=1,
                   trt_max_shape=1280,
                   trt_opt_shape=640,
                   trt_calib_mode=False,
                   cpu_threads=1,
                   enable_mkldnn=False,
                   enable_mkldnn_bfloat16=False,
                   delete_shuffle_pass=False):
    """set AnalysisConfig, generate AnalysisPredictor
    Args:
        model_dir (str): root path of model.pdmodel and model.pdiparams
        device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
        run_mode (str): mode of running(paddle/trt_fp32/trt_fp16/trt_int8)
        use_dynamic_shape (bool): use dynamic shape or not
        trt_min_shape (int): min shape for dynamic shape in trt
        trt_max_shape (int): max shape for dynamic shape in trt
        trt_opt_shape (int): opt shape for dynamic shape in trt
        trt_calib_mode (bool): If the model is produced by TRT offline quantitative
            calibration, trt_calib_mode need to set True
        delete_shuffle_pass (bool): whether to remove shuffle_channel_detect_pass in TensorRT.
                                    Used by action model.
    Returns:
        predictor (PaddlePredictor): AnalysisPredictor
    Raises:
        ValueError: predict by TensorRT need device == 'GPU'.
    """
    if device != 'GPU' and run_mode != 'paddle':
        raise ValueError(
            "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}"
            .format(run_mode, device))
    config = Config(
        os.path.join(model_dir, 'model.pdmodel'),
        os.path.join(model_dir, 'model.pdiparams'))
    if device == 'GPU':
        # initial GPU memory(M), device ID
        config.enable_use_gpu(200, 0)
        # optimize graph and fuse op
        config.switch_ir_optim(True)
    elif device == 'XPU':
        config.enable_lite_engine()
        config.enable_xpu(10 * 1024 * 1024)
    else:
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(cpu_threads)
        if enable_mkldnn:
            try:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
                if enable_mkldnn_bfloat16:
                    config.enable_mkldnn_bfloat16()
            except Exception as e:
                print(
                    "The current environment does not support `mkldnn`, so disable mkldnn."
                )
                pass

    precision_map = {
        'trt_int8': Config.Precision.Int8,
        'trt_fp32': Config.Precision.Float32,
        'trt_fp16': Config.Precision.Half
    }
    if run_mode in precision_map.keys():
        config.enable_tensorrt_engine(
            workspace_size=(1 << 25) * batch_size,
            max_batch_size=batch_size,
            min_subgraph_size=min_subgraph_size,
            precision_mode=precision_map[run_mode],
            use_static=False,
            use_calib_mode=trt_calib_mode)

        if use_dynamic_shape:
            min_input_shape = {
                'image': [batch_size, 3, trt_min_shape, trt_min_shape]
            }
            max_input_shape = {
                'image': [batch_size, 3, trt_max_shape, trt_max_shape]
            }
            opt_input_shape = {
                'image': [batch_size, 3, trt_opt_shape, trt_opt_shape]
            }
            config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
                                              opt_input_shape)
            print('trt set dynamic shape done!')

    # disable print log when predict
    config.disable_glog_info()
    # enable shared memory
    config.enable_memory_optim()
    # disable feed, fetch OP, needed by zero_copy_run
    config.switch_use_feed_fetch_ops(False)
    if delete_shuffle_pass:
        config.delete_pass("shuffle_channel_detect_pass")
    predictor = create_predictor(config)
    return predictor


def predict_image(predictor,
                  image_file,
                  image_shape=[640, 640],
                  warmup=1,
                  repeats=1,
                  threshold=0.5):
    img, scale_factor = image_preprocess(image_file, image_shape)
    inputs = {}  # feed dict: input name -> ndarray (fix: was used before assignment)
    inputs['x2paddle_image_arrays'] = img
    input_names = predictor.get_input_names()
    for i in range(len(input_names)):
        input_tensor = predictor.get_input_handle(input_names[i])
        input_tensor.copy_from_cpu(inputs[input_names[i]])

    for i in range(warmup):
        predictor.run()

    np_boxes = None
    predict_time = 0.
    time_min = float("inf")
    time_max = float('-inf')
    for i in range(repeats):
        start_time = time.time()
        predictor.run()
        output_names = predictor.get_output_names()
        boxes_tensor = predictor.get_output_handle(output_names[0])
        np_boxes = boxes_tensor.copy_to_cpu()
        end_time = time.time()
        timed = end_time - start_time
        time_min = min(time_min, timed)
        time_max = max(time_max, timed)
        predict_time += timed

    time_avg = predict_time / repeats
    print('Inference time(ms): min={}, max={}, avg={}'.format(
        round(time_min * 1000, 2),
        round(time_max * 1000, 1), round(time_avg * 1000, 1)))
    postprocess = YOLOv6PostProcess(
        score_threshold=0.001, nms_threshold=0.65, multi_label=True)
    res = postprocess(np_boxes, scale_factor)
    res_img = draw_box(
        image_file, res['bbox'], CLASS_LABEL, threshold=threshold)
    cv2.imwrite('result.jpg', res_img)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_file', type=str, default=None, help="image path")
    parser.add_argument(
        '--model_path', type=str, help="inference model filepath")
    parser.add_argument(
        '--benchmark',
        type=bool,
        default=False,
        help="Whether run benchmark or not.")
    parser.add_argument(
        '--run_mode',
        type=str,
        default='paddle',
        help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)")
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU")
    parser.add_argument(
        '--img_shape', type=int, default=640, help="input_size")
    args = parser.parse_args()

    predictor = load_predictor(
        args.model_path, run_mode=args.run_mode, device=args.device)
    warmup, repeats = 1, 1
    if args.benchmark:
        warmup, repeats = 50, 100
    predict_image(
        predictor,
        args.image_file,
        image_shape=[args.img_shape, args.img_shape],
        warmup=warmup,
        repeats=repeats)
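
A minimal usage sketch of the two helpers above: build a TensorRT INT8 predictor and push one image through preprocessing, inference, and post-processing. The model directory is a placeholder; the sample image ships with the demo repo.

```python
# Minimal sketch, assuming a quantized model exported to `yolov6s_quant/`
# (placeholder path) containing model.pdmodel / model.pdiparams:
predictor = load_predictor('yolov6s_quant', run_mode='trt_int8', device='GPU')
predict_image(
    predictor,
    'images/000000570688.jpg',  # sample image bundled with the demo
    image_shape=[640, 640],
    warmup=1,
    repeats=1,
    threshold=0.5)
# Writes the visualized detections to result.jpg.
```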
example/auto_compression/pytorch_yolov6/post_process.py
deleted (100644 → 0)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import cv2
import json
import sys


def box_area(boxes):
    """
    Args:
        boxes(np.ndarray): [N, 4]
    return: [N]
    """
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


def box_iou(box1, box2):
    """
    Args:
        box1(np.ndarray): [N, 4]
        box2(np.ndarray): [M, 4]
    return: [N, M]
    """
    area1 = box_area(box1)
    area2 = box_area(box2)
    lt = np.maximum(box1[:, np.newaxis, :2], box2[:, :2])
    rb = np.minimum(box1[:, np.newaxis, 2:], box2[:, 2:])
    wh = rb - lt
    wh = np.maximum(0, wh)
    inter = wh[:, :, 0] * wh[:, :, 1]
    iou = inter / (area1[:, np.newaxis] + area2 - inter)
    return iou


def nms(boxes, scores, iou_threshold):
    """
    Non Max Suppression numpy implementation.
    args:
        boxes(np.ndarray): [N, 4]
        scores(np.ndarray): [N, 1]
        iou_threshold(float): Threshold of IoU.
    """
    idxs = scores.argsort()
    keep = []
    while idxs.size > 0:
        max_score_index = idxs[-1]
        max_score_box = boxes[max_score_index][None, :]
        keep.append(max_score_index)
        if idxs.size == 1:
            break
        idxs = idxs[:-1]
        other_boxes = boxes[idxs]
        ious = box_iou(max_score_box, other_boxes)
        idxs = idxs[ious[0] <= iou_threshold]
    keep = np.array(keep)
    return keep


class YOLOv6PostProcess(object):
    """
    Post process of YOLOv6 network.
    args:
        score_threshold(float): Threshold to filter out bounding boxes with low
            confidence score. If not provided, consider all boxes.
        nms_threshold(float): The threshold to be used in NMS.
        multi_label(bool): Whether keep multi label in boxes.
        keep_top_k(int): Number of total bboxes to be kept per image after NMS
            step. -1 means keeping all bboxes after NMS step.
    """

    def __init__(self,
                 score_threshold=0.25,
                 nms_threshold=0.5,
                 multi_label=False,
                 keep_top_k=300):
        self.score_threshold = score_threshold
        self.nms_threshold = nms_threshold
        self.multi_label = multi_label
        self.keep_top_k = keep_top_k

    def _xywh2xyxy(self, x):
        # Convert from [x, y, w, h] to [x1, y1, x2, y2]
        y = np.copy(x)
        y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
        y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
        y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
        y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
        return y

    def _non_max_suppression(self, prediction):
        max_wh = 4096  # (pixels) minimum and maximum box width and height
        nms_top_k = 30000

        cand_boxes = prediction[..., 4] > self.score_threshold  # candidates
        output = [np.zeros((0, 6))] * prediction.shape[0]

        for batch_id, boxes in enumerate(prediction):
            # Apply constraints
            boxes = boxes[cand_boxes[batch_id]]
            if not boxes.shape[0]:
                continue
            # Compute conf (conf = obj_conf * cls_conf)
            boxes[:, 5:] *= boxes[:, 4:5]

            # Box (center x, center y, width, height) to (x1, y1, x2, y2)
            convert_box = self._xywh2xyxy(boxes[:, :4])

            # Detections matrix nx6 (xyxy, conf, cls)
            if self.multi_label:
                i, j = (boxes[:, 5:] > self.score_threshold).nonzero()
                boxes = np.concatenate(
                    (convert_box[i], boxes[i, j + 5, None],
                     j[:, None].astype(np.float32)),
                    axis=1)
            else:
                conf = np.max(boxes[:, 5:], axis=1)
                j = np.argmax(boxes[:, 5:], axis=1)
                re = np.array(conf.reshape(-1) > self.score_threshold)
                conf = conf.reshape(-1, 1)
                j = j.reshape(-1, 1)
                boxes = np.concatenate((convert_box, conf, j), axis=1)[re]

            num_box = boxes.shape[0]
            if not num_box:
                continue
            elif num_box > nms_top_k:
                boxes = boxes[boxes[:, 4].argsort()[::-1][:nms_top_k]]

            # Batched NMS
            c = boxes[:, 5:6] * max_wh
            clean_boxes, scores = boxes[:, :4] + c, boxes[:, 4]
            keep = nms(clean_boxes, scores, self.nms_threshold)
            # limit detection box num
            if keep.shape[0] > self.keep_top_k:
                keep = keep[:self.keep_top_k]
            output[batch_id] = boxes[keep]
        return output

    def __call__(self, outs, scale_factor):
        preds = self._non_max_suppression(outs)
        bboxs, box_nums = [], []
        for i, pred in enumerate(preds):
            if len(pred.shape) > 2:
                pred = np.squeeze(pred)
            if len(pred.shape) == 1:
                pred = pred[np.newaxis, :]
            pred_bboxes = pred[:, :4]
            scale_factor = np.tile(scale_factor[i][::-1], (1, 2))
            pred_bboxes /= scale_factor
            bbox = np.concatenate(
                [
                    pred[:, -1][:, np.newaxis], pred[:, -2][:, np.newaxis],
                    pred_bboxes
                ],
                axis=-1)
            bboxs.append(bbox)
            box_num = bbox.shape[0]
            box_nums.append(box_num)
        bboxs = np.concatenate(bboxs, axis=0)
        box_nums = np.array(box_nums)
        return {'bbox': bboxs, 'bbox_num': box_nums}


def coco_metric(anno_file, bboxes_list, bbox_nums_list, image_id_list):
    try:
        from pycocotools.coco import COCO
        from pycocotools.cocoeval import COCOeval
    except:
        print(
            "[ERROR] Not found pycocotools, please install by `pip install pycocotools`"
        )
        sys.exit(1)

    coco_gt = COCO(anno_file)
    cats = coco_gt.loadCats(coco_gt.getCatIds())
    clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
    results = []
    for bboxes, bbox_nums, image_id in zip(bboxes_list, bbox_nums_list,
                                           image_id_list):
        results += _get_det_res(bboxes, bbox_nums, image_id, clsid2catid)

    output = "bbox.json"
    with open(output, 'w') as f:
        json.dump(results, f)

    coco_dt = coco_gt.loadRes(output)
    coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    return coco_eval.stats


def _get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map):
    det_res = []
    k = 0
    for i in range(len(bbox_nums)):
        cur_image_id = int(image_id[i][0])
        det_nums = bbox_nums[i]
        for j in range(det_nums):
            dt = bboxes[k]
            k = k + 1
            num_id, score, xmin, ymin, xmax, ymax = dt.tolist()
            if int(num_id) < 0:
                continue
            category_id = label_to_cat_id_map[int(num_id)]
            w = xmax - xmin
            h = ymax - ymin
            bbox = [xmin, ymin, w, h]
            dt_res = {
                'image_id': cur_image_id,
                'category_id': category_id,
                'bbox': bbox,
                'score': score
            }
            det_res.append(dt_res)
    return det_res
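
As a quick smoke test of `YOLOv6PostProcess`, the sketch below feeds it a random tensor shaped like a raw YOLO head output, `[batch, num_anchors, 5 + num_classes]`; the anchor count and seed are arbitrary.

```python
import numpy as np

np.random.seed(0)
fake_out = np.random.rand(1, 100, 85).astype('float32')  # 80 COCO classes -> 5 + 80
scale_factor = np.ones((1, 2), dtype='float32')          # no resize scaling
post = YOLOv6PostProcess(
    score_threshold=0.001, nms_threshold=0.65, multi_label=True)
res = post(fake_out, scale_factor)
# 'bbox' rows are [class_id, score, x1, y1, x2, y2]; 'bbox_num' counts per image
print(res['bbox'].shape, res['bbox_num'])
```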
example/auto_compression/pytorch_yolov6/post_quant.py
deleted (100644 → 0)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import numpy as np
import argparse
import paddle
from paddleslim.common import load_config
from paddleslim.common import load_onnx_model
from paddleslim.quant import quant_post_static
from dataset import COCOTrainDataset


def argsparser():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--config_path',
        type=str,
        default=None,
        help="path of compression strategy config.",
        required=True)
    parser.add_argument(
        '--save_dir',
        type=str,
        default='ptq_out',
        help="directory to save compressed model.")
    parser.add_argument(
        '--devices',
        type=str,
        default='gpu',
        help="which device used to compress.")
    parser.add_argument(
        '--algo', type=str, default='KL', help="post quant algo.")
    return parser


def main():
    global global_config
    all_config = load_config(FLAGS.config_path)
    global_config = all_config["Global"]

    dataset = COCOTrainDataset(
        dataset_dir=global_config['dataset_dir'],
        image_dir=global_config['val_image_dir'],
        anno_path=global_config['val_anno_path'])
    train_loader = paddle.io.DataLoader(
        dataset, batch_size=1, shuffle=True, drop_last=True, num_workers=0)

    place = paddle.CUDAPlace(0) if FLAGS.devices == 'gpu' else paddle.CPUPlace()
    exe = paddle.static.Executor(place)

    load_onnx_model(global_config["model_dir"])
    inference_model_path = global_config["model_dir"].rstrip().rstrip(
        '.onnx') + '_infer'

    quant_post_static(
        executor=exe,
        model_dir=inference_model_path,
        quantize_model_path=FLAGS.save_dir,
        data_loader=train_loader,
        model_filename='model.pdmodel',
        params_filename='model.pdiparams',
        batch_size=32,
        batch_nums=10,
        algo=FLAGS.algo,
        hist_percent=0.999,
        is_full_quantize=False,
        bias_correction=False,
        onnx_format=True)


if __name__ == '__main__':
    paddle.enable_static()
    parser = argsparser()
    FLAGS = parser.parse_args()

    assert FLAGS.devices in ['cpu', 'gpu', 'xpu', 'npu']
    paddle.set_device(FLAGS.devices)

    main()
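
Note that `quant_post_static` only needs a `DataLoader` yielding feed dicts keyed by the model's input name; `COCOTrainDataset` provides that here. A stand-in, for illustration only (the class name, sample count, and shapes are hypothetical), could look like:

```python
import numpy as np
import paddle

class RandomCalibDataset(paddle.io.Dataset):
    """Hypothetical stand-in for COCOTrainDataset: yields {'image': CHW float32}
    samples matching the 640x640 model input assumed by this demo."""

    def __init__(self, num_samples=32):
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return {'image': np.random.rand(3, 640, 640).astype('float32')}

calib_loader = paddle.io.DataLoader(RandomCalibDataset(), batch_size=1)
```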
example/auto_compression/pytorch_yolov6/run.py
deleted (100644 → 0)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import numpy as np
import argparse
from tqdm import tqdm

import paddle
from paddleslim.common import load_config as load_slim_config
from paddleslim.auto_compression import AutoCompression
from dataset import COCOValDataset, COCOTrainDataset
from post_process import YOLOv6PostProcess, coco_metric


def argsparser():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--config_path',
        type=str,
        default=None,
        help="path of compression strategy config.",
        required=True)
    parser.add_argument(
        '--save_dir',
        type=str,
        default='output',
        help="directory to save compressed model.")
    parser.add_argument(
        '--devices',
        type=str,
        default='gpu',
        help="which device used to compress.")
    parser.add_argument(
        '--eval', type=bool, default=False, help="whether to run evaluation.")
    return parser


def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
    bboxes_list, bbox_nums_list, image_id_list = [], [], []
    with tqdm(
            total=len(val_loader),
            bar_format='Evaluation stage, Run batch:|{bar}| {n_fmt}/{total_fmt}',
            ncols=80) as t:
        for data in val_loader:
            data_all = {k: np.array(v) for k, v in data.items()}
            outs = exe.run(compiled_test_program,
                           feed={test_feed_names[0]: data_all['image']},
                           fetch_list=test_fetch_list,
                           return_numpy=False)
            res = {}
            postprocess = YOLOv6PostProcess(
                score_threshold=0.001, nms_threshold=0.65, multi_label=True)
            res = postprocess(np.array(outs[0]), data_all['scale_factor'])
            bboxes_list.append(res['bbox'])
            bbox_nums_list.append(res['bbox_num'])
            image_id_list.append(np.array(data_all['im_id']))
            t.update()
    map_res = coco_metric(anno_file, bboxes_list, bbox_nums_list, image_id_list)
    return map_res[0]


def main():
    global global_config
    all_config = load_slim_config(FLAGS.config_path)
    assert "Global" in all_config, "Key 'Global' not found in config file.\n{}".format(
        all_config)
    global_config = all_config["Global"]

    dataset = COCOTrainDataset(
        dataset_dir=global_config['dataset_dir'],
        image_dir=global_config['train_image_dir'],
        anno_path=global_config['train_anno_path'])
    train_loader = paddle.io.DataLoader(
        dataset, batch_size=1, shuffle=True, drop_last=True, num_workers=0)

    if 'Evaluation' in global_config.keys() and global_config[
            'Evaluation'] and paddle.distributed.get_rank() == 0:
        eval_func = eval_function
        global val_loader
        dataset = COCOValDataset(
            dataset_dir=global_config['dataset_dir'],
            image_dir=global_config['val_image_dir'],
            anno_path=global_config['val_anno_path'])
        global anno_file
        anno_file = dataset.ann_file
        val_loader = paddle.io.DataLoader(
            dataset,
            batch_size=1,
            shuffle=False,
            drop_last=False,
            num_workers=0)
    else:
        eval_func = None

    ac = AutoCompression(
        model_dir=global_config["model_dir"],
        train_dataloader=train_loader,
        save_dir=FLAGS.save_dir,
        config=all_config,
        eval_callback=eval_func)
    ac.compress()


if __name__ == '__main__':
    paddle.enable_static()
    parser = argsparser()
    FLAGS = parser.parse_args()

    assert FLAGS.devices in ['cpu', 'gpu', 'xpu', 'npu']
    paddle.set_device(FLAGS.devices)

    main()
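
For reference, the script above only reads a handful of keys under `Global`; a sketch of the expected config structure, with placeholder paths and the compression strategy sections omitted, is:

```python
# Hypothetical shape of the loaded YAML, restricted to the keys run.py reads:
all_config = {
    'Global': {
        'model_dir': 'yolov6s.onnx',                                # placeholder
        'dataset_dir': 'dataset/coco/',
        'train_image_dir': 'train2017',
        'train_anno_path': 'annotations/instances_train2017.json',
        'val_image_dir': 'val2017',
        'val_anno_path': 'annotations/instances_val2017.json',
        'Evaluation': True,
    },
    # ... compression strategy sections consumed by AutoCompression ...
}
```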
example/auto_compression/pytorch_yolov6/yolov6_onnx_trt.py
deleted (100644 → 0)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import cv2
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import os
import time
import random
import argparse

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
EXPLICIT_PRECISION = 1 << (int)(
    trt.NetworkDefinitionCreationFlag.EXPLICIT_PRECISION)

# load coco labels
CLASS_LABEL = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
    "truck", "boat", "traffic light", "fire hydrant", "stop sign",
    "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag",
    "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
    "baseball bat", "baseball glove", "skateboard", "surfboard",
    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon",
    "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
    "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant",
    "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote",
    "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
    "hair drier", "toothbrush"
]


def preprocess(image, input_size, mean=None, std=None, swap=(2, 0, 1)):
    if len(image.shape) == 3:
        padded_img = np.ones((input_size[0], input_size[1], 3)) * 114.0
    else:
        padded_img = np.ones(input_size) * 114.0
    img = np.array(image)
    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
    resized_img = cv2.resize(
        img,
        (int(img.shape[1] * r), int(img.shape[0] * r)),
        interpolation=cv2.INTER_LINEAR,
    ).astype(np.float32)
    padded_img[:int(img.shape[0] * r), :int(img.shape[1] * r)] = resized_img
    padded_img = padded_img[:, :, ::-1]
    padded_img /= 255.0
    if mean is not None:
        padded_img -= mean
    if std is not None:
        padded_img /= std
    padded_img = padded_img.transpose(swap)
    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
    return padded_img, r


def postprocess(predictions, ratio):
    boxes = predictions[:, :4]
    scores = predictions[:, 4:5] * predictions[:, 5:]
    boxes_xyxy = np.ones_like(boxes)
    boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.
    boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.
    boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.
    boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.
    boxes_xyxy /= ratio
    dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
    return dets


def nms(boxes, scores, nms_thr):
    """Single class NMS implemented in Numpy."""
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        inds = np.where(ovr <= nms_thr)[0]
        order = order[inds + 1]
    return keep


def multiclass_nms(boxes, scores, nms_thr, score_thr):
    """Multiclass NMS implemented in Numpy"""
    final_dets = []
    num_classes = scores.shape[1]
    for cls_ind in range(num_classes):
        cls_scores = scores[:, cls_ind]
        valid_score_mask = cls_scores > score_thr
        if valid_score_mask.sum() == 0:
            continue
        else:
            valid_scores = cls_scores[valid_score_mask]
            valid_boxes = boxes[valid_score_mask]
            keep = nms(valid_boxes, valid_scores, nms_thr)
            if len(keep) > 0:
                cls_inds = np.ones((len(keep), 1)) * cls_ind
                dets = np.concatenate(
                    [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1)
                final_dets.append(dets)
    if len(final_dets) == 0:
        return None
    return np.concatenate(final_dets, 0)


def get_color_map_list(num_classes):
    color_map = num_classes * [0, 0, 0]
    for i in range(0, num_classes):
        j = 0
        lab = i
        while lab:
            color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
            color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
            color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
            j += 1
            lab >>= 3
    color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)]
    return color_map


def draw_box(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
    color_list = get_color_map_list(len(class_names))
    for i in range(len(boxes)):
        box = boxes[i]
        cls_id = int(cls_ids[i])
        color = tuple(color_list[cls_id])
        score = scores[i]
        if score < conf:
            continue
        x0 = int(box[0])
        y0 = int(box[1])
        x1 = int(box[2])
        y1 = int(box[3])

        text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100)
        font = cv2.FONT_HERSHEY_SIMPLEX
        txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
        cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
        cv2.rectangle(img, (x0, y0 + 1),
                      (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])),
                      color, -1)
        cv2.putText(
            img,
            text, (x0, y0 + txt_size[1]),
            font,
            0.8, (0, 255, 0),
            thickness=2)
    return img


def get_engine(precision, model_file_path):
    # TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
    TRT_LOGGER = trt.Logger()
    builder = trt.Builder(TRT_LOGGER)
    config = builder.create_builder_config()
    if precision == 'int8':
        network = builder.create_network(EXPLICIT_BATCH | EXPLICIT_PRECISION)
    else:
        network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    runtime = trt.Runtime(TRT_LOGGER)
    if model_file_path.endswith('.trt'):
        # If a serialized engine exists, use it instead of building an engine.
        print("Reading engine from file {}".format(model_file_path))
        with open(model_file_path,
                  "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
            for i in range(network.num_layers):
                layer = network.get_layer(i)
                print(i, layer.name)
            return engine
    else:
        config.max_workspace_size = 1 << 30
        if precision == "fp16":
            if not builder.platform_has_fast_fp16:
                print("FP16 is not supported natively on this platform/device")
            else:
                config.set_flag(trt.BuilderFlag.FP16)
        elif precision == "int8":
            if not builder.platform_has_fast_int8:
                print("INT8 is not supported natively on this platform/device")
            else:
                if builder.platform_has_fast_fp16:
                    # Also enable fp16, as some layers may be even more efficient in fp16 than int8
                    config.set_flag(trt.BuilderFlag.FP16)
                config.set_flag(trt.BuilderFlag.INT8)
        builder.max_batch_size = 1

        print('Loading ONNX file from path {}...'.format(model_file_path))
        with open(model_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        print('Completed parsing of ONNX file')

        print('Building an engine from file {}; this may take a while...'.
              format(model_file_path))
        plan = builder.build_serialized_network(network, config)
        engine = runtime.deserialize_cuda_engine(plan)
        print("Completed creating Engine")
        with open(model_file_path, "wb") as f:
            f.write(engine.serialize())
        for i in range(network.num_layers):
            layer = network.get_layer(i)
            print(i, layer.name)
        return engine


# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(
            binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def run_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def main(args):
    onnx_model = args.model_path
    img_path = args.image_file
    num_class = len(CLASS_LABEL)
    repeat = 1000
    engine = get_engine(args.precision, onnx_model)
    model_all_names = []
    for idx in range(engine.num_bindings):
        is_input = engine.binding_is_input(idx)
        name = engine.get_binding_name(idx)
        op_type = engine.get_binding_dtype(idx)
        model_all_names.append(name)
        shape = engine.get_binding_shape(idx)
        print('input id:', idx, '   is input: ', is_input, '  binding name:',
              name, '  shape:', shape, 'type: ', op_type)
    context = engine.create_execution_context()
    print('Allocate buffers ...')
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    print("TRT set input ...")
    origin_img = cv2.imread(img_path)
    input_shape = [args.img_shape, args.img_shape]
    input_image, ratio = preprocess(origin_img, input_shape)
    inputs[0].host = np.expand_dims(input_image, axis=0)

    for _ in range(0, 50):
        trt_outputs = run_inference(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream)

    time1 = time.time()
    for _ in range(0, repeat):
        trt_outputs = run_inference(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream)
    time2 = time.time()
    # total time cost(ms)
    total_inference_cost = (time2 - time1) * 1000
    print("model path: ", onnx_model, " precision: ", args.precision)
    print("In TensorRT, ",
          "average latency is : {} ms".format(total_inference_cost / repeat))

    # Do postprocess
    output = trt_outputs[0]
    predictions = np.reshape(output, (1, -1, int(5 + num_class)))[0]
    dets = postprocess(predictions, ratio)
    # Draw rectangles and labels on the original image
    if dets is not None:
        final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], \
            dets[:, 5]
        origin_img = draw_box(
            origin_img,
            final_boxes,
            final_scores,
            final_cls_inds,
            conf=0.5,
            class_names=CLASS_LABEL)
    cv2.imwrite('output.jpg', origin_img)
    print('The prediction results are saved in output.jpg.')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_path',
        type=str,
        default="quant_model.onnx",
        help="inference model filepath")
    parser.add_argument(
        '--image_file', type=str, default="bus.jpg", help="image path")
    parser.add_argument(
        '--precision', type=str, default='fp32', help="support fp32/fp16/int8.")
    parser.add_argument(
        '--img_shape', type=int, default=640, help="input_size")
    args = parser.parse_args()
    main(args)
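
The pre/post helpers above can be exercised without a GPU or TensorRT. The sketch below pushes a dummy frame through `preprocess()` and decodes a fabricated prediction tensor with `postprocess()`; the anchor count is arbitrary, chosen only for illustration.

```python
import numpy as np

dummy_frame = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)
img, ratio = preprocess(dummy_frame, (640, 640))
print(img.shape, ratio)  # (3, 640, 640) CHW input and the resize ratio

fake_pred = np.random.rand(1000, 5 + len(CLASS_LABEL)).astype(np.float32)
dets = postprocess(fake_pred, ratio)  # may be None if nothing passes score_thr
if dets is not None:
    print(dets.shape)  # [M, 6]: x1, y1, x2, y2, score, class_id
```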
example/auto_compression/pytorch_yolov7/cpp_infer/CMakeLists.txt
deleted (100644 → 0)
cmake_minimum_required(VERSION 3.0)
project(cpp_inference_demo CXX C)
option(WITH_MKL         "Compile demo with MKL/OpenBlas support, default use MKL."     ON)
option(WITH_GPU         "Compile demo with GPU/CPU, default use CPU."                  OFF)
option(WITH_STATIC_LIB  "Compile demo with static/shared library, default use static." ON)
option(USE_TENSORRT     "Compile demo with TensorRT."                                  OFF)
option(WITH_ROCM        "Compile demo with rocm."                                      OFF)
option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime"                                OFF)
option(WITH_ARM         "Compile demo with ARM"                                        OFF)
option(WITH_MIPS        "Compile demo with MIPS"                                       OFF)
option(WITH_SW          "Compile demo with SW"                                         OFF)
option(WITH_XPU         "Compile demo with XPU"                                        OFF)
option(WITH_NPU         "Compile demo with NPU"                                        OFF)

if(NOT WITH_STATIC_LIB)
  add_definitions("-DPADDLE_WITH_SHARED_LIB")
else()
  # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode.
  # Set it to empty in static library mode to avoid compilation issues.
  add_definitions("/DPD_INFER_DECL=")
endif()

macro(safe_set_static_flag)
  foreach(flag_var
      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
    if(${flag_var} MATCHES "/MD")
      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
    endif(${flag_var} MATCHES "/MD")
  endforeach(flag_var)
endmacro()

if(NOT DEFINED PADDLE_LIB)
  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
endif()
if(NOT DEFINED DEMO_NAME)
  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
endif()

include_directories("${PADDLE_LIB}/")
set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include")

link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
link_directories("${PADDLE_LIB}/paddle/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib")

if(WIN32)
  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
  option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
  if(MSVC_STATIC_CRT)
    if(WITH_MKL)
      set(FLAG_OPENMP "/openmp")
    endif()
    set(CMAKE_C_FLAGS_DEBUG     "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
    set(CMAKE_C_FLAGS_RELEASE   "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
    set(CMAKE_CXX_FLAGS_DEBUG   "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
    safe_set_static_flag()
    if(WITH_STATIC_LIB)
      add_definitions(-DSTATIC_LIB)
    endif()
  endif()
else()
  if(WITH_MKL)
    set(FLAG_OPENMP "-fopenmp")
  endif()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}")
endif()

if(WITH_GPU)
  if(NOT WIN32)
    include_directories("/usr/local/cuda/include")
    if(CUDA_LIB STREQUAL "")
      set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
    endif()
  else()
    include_directories("C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include")
    if(CUDA_LIB STREQUAL "")
      set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
    endif()
  endif(NOT WIN32)
endif()

if(USE_TENSORRT AND WITH_GPU)
  set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library")
  if("${TENSORRT_ROOT}" STREQUAL "")
    message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ")
  endif()
  set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include)
  set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib)
  file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
  string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
    "${TENSORRT_VERSION_FILE_CONTENTS}")
  if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
    file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h TENSORRT_VERSION_FILE_CONTENTS)
    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
      "${TENSORRT_VERSION_FILE_CONTENTS}")
  endif()
  if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
    message(SEND_ERROR "Failed to detect TensorRT version.")
  endif()
  string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
    TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
  message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
    "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
  include_directories("${TENSORRT_INCLUDE_DIR}")
  link_directories("${TENSORRT_LIB_DIR}")
endif()

if(WITH_MKL)
  set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml")
  include_directories("${MATH_LIB_PATH}/include")
  if(WIN32)
    set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
      ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
  else()
    set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
      ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
  endif()
  set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn")
  if(EXISTS ${MKLDNN_PATH})
    include_directories("${MKLDNN_PATH}/include")
    if(WIN32)
      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
    else(WIN32)
      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
    endif(WIN32)
  endif()
elseif((NOT WITH_MIPS) AND (NOT WITH_SW))
  set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas")
  include_directories("${OPENBLAS_LIB_PATH}/include/openblas")
  if(WIN32)
    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX})
  else()
    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
  endif()
endif()

if(WITH_STATIC_LIB)
  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
else()
  if(WIN32)
    set(DEPS ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
  else()
    set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX})
  endif()
endif()

if(WITH_ONNXRUNTIME)
  if(WIN32)
    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx)
  elseif(APPLE)
    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx)
  else()
    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx)
  endif()
endif()

if(NOT WIN32)
  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
  set(DEPS ${DEPS}
    ${MATH_LIB} ${MKLDNN_LIB}
    glog gflags protobuf xxhash cryptopp
    ${EXTERNAL_LIB})
else()
  set(DEPS ${DEPS}
    ${MATH_LIB} ${MKLDNN_LIB}
    glog gflags_static libprotobuf xxhash cryptopp-static ${EXTERNAL_LIB})
  set(DEPS ${DEPS} shlwapi.lib)
endif(NOT WIN32)

if(WITH_GPU)
  if(NOT WIN32)
    if(USE_TENSORRT)
      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX})
      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX})
    endif()
    set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
  else()
    if(USE_TENSORRT)
      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
      if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
        set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX})
      endif()
    endif()
    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX})
    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX})
    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX})
  endif()
endif()

if(WITH_ROCM AND NOT WIN32)
  set(DEPS ${DEPS} ${ROCM_LIB}/libamdhip64${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()

if(WITH_XPU AND NOT WIN32)
  set(XPU_INSTALL_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}xpu")
  set(DEPS ${DEPS} ${XPU_INSTALL_PATH}/lib/libxpuapi${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(DEPS ${DEPS} ${XPU_INSTALL_PATH}/lib/libxpurt${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()

if(WITH_NPU AND NOT WIN32)
  set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libgraph${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libge_runner${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libascendcl${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libacl_op_compiler${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()

add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
target_link_libraries(${DEMO_NAME} ${DEPS})

if(WIN32)
  if(USE_TENSORRT)
    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}
        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
      COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE})
    if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
      add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX}
          ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE})
    endif()
  endif()
  if(WITH_MKL)
    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release
      COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release
      COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/Release)
  else()
    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release)
  endif()
  if(WITH_ONNXRUNTIME)
    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
      COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE})
  endif()
  if(NOT WITH_STATIC_LIB)
    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll"
        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE})
  endif()
endif()
example/auto_compression/pytorch_yolov7/cpp_infer/README.md
deleted (100644 → 0)
# YOLOv7 TensorRT Benchmark (Linux)

## Environment Setup
- CUDA and cuDNN: make sure CUDA and cuDNN are installed, and note their install paths in advance.
- TensorRT: download [TensorRT 8.4.1.5](https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.4.1/tars/tensorrt-8.4.1.5.linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz) or another release from the NVIDIA website.
- Paddle Inference C++ library: to build the develop branch, see the [build guide](https://www.paddlepaddle.org.cn/inference/user_guides/source_compile.html). After the build finishes, a `paddle_inference_install_dir` folder is generated under the build directory; this is the C++ inference library we need.

## Build the Executable
- (1) Edit the dependency library paths in `compile.sh`, mainly the following:

```shell
# Path of the Paddle Inference library
LIB_DIR=/root/auto_compress/Paddle/build/paddle_inference_install_dir/
# cuDNN path
CUDNN_LIB=/usr/lib/x86_64-linux-gnu/
# CUDA path
CUDA_LIB=/usr/local/cuda/lib64
# TensorRT install path: the absolute path of the extracted TRT package, containing `lib` and `include` folders
TENSORRT_ROOT=/root/auto_compress/trt/trt8.4/
```

## Test
- FP32
```
./build/trt_run --model_file yolov7_infer/model.pdmodel --params_file yolov7_infer/model.pdiparams --run_mode=trt_fp32
```

- FP16
```
./build/trt_run --model_file yolov7_infer/model.pdmodel --params_file yolov7_infer/model.pdiparams --run_mode=trt_fp16
```

- INT8
```
./build/trt_run --model_file yolov7_quant/model.pdmodel --params_file yolov7_quant/model.pdiparams --run_mode=trt_int8
```

## Performance Comparison

| Inference library | Model | Latency<sup><small>FP32</small></sup> (ms) | Latency<sup><small>FP16</small></sup> (ms) | Latency<sup><small>INT8</small></sup> (ms) |
| :--------: | :--------: | :--------: | :--------: | :--------: |
| Paddle TensorRT | YOLOv7 | 26.84 | 7.44 | 4.55 |
| TensorRT | YOLOv7 | 28.25 | 7.23 | 4.67 |

Environment:
- Tesla T4, TensorRT 8.4.1, CUDA 11.2
- batch_size=1
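
A quick sanity check on the table, computed directly from the latencies above: on Paddle TensorRT, FP16 and INT8 speed YOLOv7 up by roughly 3.6x and 5.9x over FP32.

```python
# Simple arithmetic over the Paddle TensorRT row of the table above (ms):
fp32, fp16, int8 = 26.84, 7.44, 4.55
print(f"FP16 speedup: {fp32 / fp16:.2f}x, INT8 speedup: {fp32 / int8:.2f}x")
# -> FP16 speedup: 3.61x, INT8 speedup: 5.90x
```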
example/auto_compression/pytorch_yolov7/cpp_infer/compile.sh
deleted (100644 → 0)
#!/bin/bash
set +x
set -e

work_path=$(dirname $(readlink -f $0))

mkdir -p build
cd build
rm -rf *

DEMO_NAME=trt_run

WITH_MKL=ON
WITH_GPU=ON
USE_TENSORRT=ON

LIB_DIR=/root/auto_compress/Paddle/build/paddle_inference_install_dir/
CUDNN_LIB=/usr/lib/x86_64-linux-gnu/
CUDA_LIB=/usr/local/cuda/lib64
TENSORRT_ROOT=/root/auto_compress/trt/trt8.4/

WITH_ROCM=OFF
ROCM_LIB=/opt/rocm/lib

cmake .. -DPADDLE_LIB=${LIB_DIR} \
  -DWITH_MKL=${WITH_MKL} \
  -DDEMO_NAME=${DEMO_NAME} \
  -DWITH_GPU=${WITH_GPU} \
  -DWITH_STATIC_LIB=OFF \
  -DUSE_TENSORRT=${USE_TENSORRT} \
  -DWITH_ROCM=${WITH_ROCM} \
  -DROCM_LIB=${ROCM_LIB} \
  -DCUDNN_LIB=${CUDNN_LIB} \
  -DCUDA_LIB=${CUDA_LIB} \
  -DTENSORRT_ROOT=${TENSORRT_ROOT}

make -j
example/auto_compression/pytorch_yolov7/cpp_infer/trt_run.cc
deleted (100644 → 0)
#include <chrono>
#include <iostream>
#include <memory>
#include <numeric>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <cuda_runtime.h>

#include "paddle/include/paddle_inference_api.h"
#include "paddle/include/experimental/phi/common/float16.h"

using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::CreatePredictor;
using paddle_infer::PrecisionType;
using phi::dtype::float16;

DEFINE_string(model_dir, "", "Directory of the inference model.");
DEFINE_string(model_file, "", "Path of the inference model file.");
DEFINE_string(params_file, "", "Path of the inference params file.");
DEFINE_string(run_mode, "trt_fp32",
              "run_mode which can be: trt_fp32, trt_fp16 and trt_int8");
DEFINE_int32(batch_size, 1, "Batch size.");
DEFINE_int32(gpu_id, 0, "GPU card ID num.");
DEFINE_int32(trt_min_subgraph_size, 3, "tensorrt min_subgraph_size");
DEFINE_int32(warmup, 50, "warmup");
DEFINE_int32(repeats, 1000, "repeats");

using Time = decltype(std::chrono::high_resolution_clock::now());
Time time() { return std::chrono::high_resolution_clock::now(); }
double time_diff(Time t1, Time t2) {
  typedef std::chrono::microseconds ms;
  auto diff = t2 - t1;
  ms counter = std::chrono::duration_cast<ms>(diff);
  return counter.count() / 1000.0;
}

std::shared_ptr<Predictor> InitPredictor() {
  Config config;
  std::string model_path;
  if (FLAGS_model_dir != "") {
    config.SetModel(FLAGS_model_dir);
    model_path = FLAGS_model_dir.substr(0, FLAGS_model_dir.find_last_of("/"));
  } else {
    config.SetModel(FLAGS_model_file, FLAGS_params_file);
    model_path = FLAGS_model_file.substr(0, FLAGS_model_file.find_last_of("/"));
  }
  // enable tune
  std::cout << "model_path: " << model_path << std::endl;
  config.EnableUseGpu(256, FLAGS_gpu_id);
  if (FLAGS_run_mode == "trt_fp32") {
    config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size,
                                FLAGS_trt_min_subgraph_size,
                                PrecisionType::kFloat32, false, false);
  } else if (FLAGS_run_mode == "trt_fp16") {
    config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size,
                                FLAGS_trt_min_subgraph_size,
                                PrecisionType::kHalf, false, false);
  } else if (FLAGS_run_mode == "trt_int8") {
    config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size,
                                FLAGS_trt_min_subgraph_size,
                                PrecisionType::kInt8, false, false);
  }
  config.EnableMemoryOptim();
  config.SwitchIrOptim(true);
  return CreatePredictor(config);
}

template <typename type>
void run(Predictor *predictor, const std::vector<type> &input,
         const std::vector<int> &input_shape, type *out_data,
         std::vector<int> out_shape) {
  // prepare input
  int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1,
                                  std::multiplies<int>());

  auto input_names = predictor->GetInputNames();
  auto input_t = predictor->GetInputHandle(input_names[0]);
  input_t->Reshape(input_shape);
  input_t->CopyFromCpu(input.data());

  for (int i = 0; i < FLAGS_warmup; ++i) CHECK(predictor->Run());

  auto st = time();
  for (int i = 0; i < FLAGS_repeats; ++i) {
    auto input_names = predictor->GetInputNames();
    auto input_t = predictor->GetInputHandle(input_names[0]);
    input_t->Reshape(input_shape);
    input_t->CopyFromCpu(input.data());

    CHECK(predictor->Run());

    auto output_names = predictor->GetOutputNames();
    auto output_t = predictor->GetOutputHandle(output_names[0]);
    std::vector<int> output_shape = output_t->shape();
    output_t->ShareExternalData<type>(out_data, out_shape,
                                      paddle_infer::PlaceType::kGPU);
  }

  LOG(INFO) << "[" << FLAGS_run_mode << " bs-" << FLAGS_batch_size
            << " ] run avg time is " << time_diff(st, time()) / FLAGS_repeats
            << " ms";
}

int main(int argc, char *argv[]) {
  google::ParseCommandLineFlags(&argc, &argv, true);
  auto predictor = InitPredictor();
  std::vector<int> input_shape = {FLAGS_batch_size, 3, 640, 640};
  // float16
  using dtype = float16;
  std::vector<dtype> input_data(FLAGS_batch_size * 3 * 640 * 640, dtype(1.0));

  dtype *out_data;
  int out_data_size = FLAGS_batch_size * 25200 * 85;
  cudaHostAlloc((void **)&out_data, sizeof(float) * out_data_size,
                cudaHostAllocMapped);

  std::vector<int> out_shape{FLAGS_batch_size, 1, 25200, 85};
  run<dtype>(predictor.get(), input_data, input_shape, out_data, out_shape);
  return 0;
}
example/auto_compression/pytorch_yolov7/images/000000570688.jpg
deleted (100644 → 0, 135.1 KB)
example/auto_compression/pytorch_yolov7/post_quant.py
deleted (100644 → 0)
(Content identical to example/auto_compression/pytorch_yolov6/post_quant.py reproduced above; the same post-training quantization script was duplicated per model directory before this commit unified the YOLO series demos.)