Commit 8c1cb2af authored by chenjiaoAngel

update code

Merge branch 'int8' of https://github.com/chenjiaoAngel/Paddle-Lite into int8
......@@ -63,6 +63,16 @@ test/models/
test/images/
*.pyc
# model
*.nb
*.svg
*.dot
# vim intermediate files
*.swp
# Emacs intermediate files
*~
......
......@@ -97,7 +97,7 @@ lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF)
lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
# publish options
......
......@@ -61,7 +61,8 @@ For demands of Apple's GPU Metal and web front end inference, please see `./meta
Paddle Lite has referenced the following open-source projects:
- [ARM compute library](http://agroup.baidu.com/paddle-infer/md/article/%28https://github.com/ARM-software/ComputeLibrary%29)
- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin have been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite.
- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin have been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite.
## Feedback and Community Support
......
......@@ -186,8 +186,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
add_definitions("-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK")
endif()
if (LITE_SHUTDOWN_LOG)
add_definitions("-DLITE_SHUTDOWN_LOG")
if (LITE_WITH_LOG)
add_definitions("-DLITE_WITH_LOG")
endif()
if (LITE_ON_TINY_PUBLISH)
......
......@@ -32,34 +32,3 @@ endif()
message(STATUS "APU_DDK_INC: ${APU_DDK_INC}")
include_directories("${APU_DDK_ROOT}/include")
set(APU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(APU_SUB_LIB_PATH "lib64")
endif()
find_library(APU_NEURON_FILE NAMES neuron
PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH})
find_library(APU_NEURON_ADAPTER_FILE NAMES neuron_adapter
PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH})
if(NOT APU_NEURON_FILE)
message(FATAL_ERROR "Can not find APU_NEURON_FILE in ${APU_DDK_ROOT}")
else()
message(STATUS "Found APU NEURON Library: ${APU_NEURON_FILE}")
add_library(apu_neuron SHARED IMPORTED GLOBAL)
set_property(TARGET apu_neuron PROPERTY IMPORTED_LOCATION ${APU_NEURON_FILE})
endif()
if(NOT APU_NEURON_ADAPTER_FILE)
message(FATAL_ERROR "Can not find APU_NEURON_ADAPTER_FILE in ${APU_DDK_ROOT}")
else()
message(STATUS "Found APU NEURON ADAPTER Library: ${APU_NEURON_ADAPTER_FILE}")
add_library(apu_neuron_adapter SHARED IMPORTED GLOBAL)
set_property(TARGET apu_neuron_adapter PROPERTY IMPORTED_LOCATION ${APU_NEURON_ADAPTER_FILE})
endif()
set(apu_runtime_libs apu_neuron apu_neuron_adapter CACHE INTERNAL "apu runtime libs")
message(STATUS "${apu_runtime_libs}")
......@@ -45,7 +45,7 @@ else()
# we changed the source code to adapt for windows compiling
# git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
######################################################################################################
URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
URL http://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1
PREFIX ${EIGEN_SOURCE_DIR}
......
......@@ -400,7 +400,7 @@ std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>
- `None`
Returns: the model structure data in memory
Returns: the model parameter data in memory
Return type: `const std::string&`
......
# Deploying PaddleLite for Inference on Baidu XPU
Paddle Lite supports inference deployment on Baidu XPU on x86 and ARM servers (e.g. Phytium FT-2000+/64).
Two integration modes are currently supported: Kernel and subgraph. The subgraph mode is similar to the earlier Huawei NPU integration: the Paddle model is loaded and analyzed, Paddle operators are converted into XTCL graph-building API calls to construct the network, and the model is generated and executed online.
## Current Support
### Supported Chips
- Kunlun 818-100 (inference chip)
- Kunlun 818-300 (training chip)
### Supported Devices
- K100/K200 Kunlun AI accelerator cards
### Supported Paddle Models
- [ResNet50](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz)
- [BERT](https://paddlelite-demo.bj.bcebos.com/models/bert_fp32_fluid.tar.gz)
- [ERNIE](https://paddlelite-demo.bj.bcebos.com/models/ernie_fp32_fluid.tar.gz)
- YOLOv3
- Mask R-CNN
- Faster R-CNN
- UNet
- SENet
- SSD
- Internal Baidu production models (details cannot be disclosed for confidentiality reasons)
### Supported (or Partially Supported) Paddle Operators (Kernel mode)
- scale
- relu
- tanh
- sigmoid
- stack
- matmul
- pool2d
- slice
- lookup_table
- elementwise_add
- elementwise_sub
- cast
- batch_norm
- mul
- layer_norm
- softmax
- conv2d
- io_copy
- io_copy_once
- __xpu__fc
- __xpu__multi_encoder
- __xpu__resnet50
- __xpu__embedding_with_eltwise_add
### Supported (or Partially Supported) Paddle Operators (subgraph/XTCL mode)
- relu
- tanh
- conv2d
- depthwise_conv2d
- elementwise_add
- pool2d
- softmax
- mul
- batch_norm
- stack
- gather
- scale
- lookup_table
- slice
- transpose
- transpose2
- reshape
- reshape2
- layer_norm
- gelu
- dropout
- matmul
- cast
- yolo_box
## Reference Example
### Test Device (K100 Kunlun AI Accelerator Card)
![baidu_xpu](https://paddlelite-demo.bj.bcebos.com/devices/baidu/baidu_xpu.jpg)
### Preparing the Device Environment
- K100/K200 Kunlun AI accelerator card [specification sheet](https://paddlelite-demo.bj.bcebos.com/devices/baidu/K100_K200_spec.pdf); for a more detailed specification or to purchase the product, please contact Ouyang Jian at ouyangjian@baidu.com;
- The K100 is a full-length, half-height PCI-E card and the K200 is a full-length, full-height PCI-E card; both require a PCI-E x16 slot and a separate 8-pin power cable;
- Install the K100/K200 driver. Ubuntu and CentOS are currently supported; because the driver depends on the Linux kernel version, make sure to install the driver package matching your kernel.
### Preparing the Local Build Environment
- To keep the build environment consistent, it is recommended to follow the Linux development environment described in [Compiling from Source](../user_guides/source_compile);
- Building the example program requires OpenCV and CMake 3.10.3; install them with the following commands:
```shell
$ sudo apt-get update
$ sudo apt-get install gcc g++ make wget unzip libopencv-dev pkg-config
$ wget https://www.cmake.org/files/v3.10/cmake-3.10.3.tar.gz
$ tar -zxvf cmake-3.10.3.tar.gz
$ cd cmake-3.10.3
$ ./configure
$ make
$ sudo make install
```
### Running the Image Classification Example
- Download the example program from [https://paddlelite-demo.bj.bcebos.com/devices/baidu/PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/baidu/PaddleLite-linux-demo.tar.gz); after extraction the contents are as follows:
```shell
- PaddleLite-linux-demo
  - image_classification_demo
    - assets
      - images
        - tabby_cat.jpg # test image
      - labels
        - synset_words.txt # label file for the 1000 ImageNet classes
      - models
        - resnet50_fp32_224_fluid # ResNet50 float32 model in Paddle fluid non-combined format
          - __model__ # Paddle fluid model topology file; drag it into https://lutzroeder.github.io/netron/ to visualize the network structure
          - bn2a_branch1_mean # Paddle fluid model parameter files
          - bn2a_branch1_scale
          ...
    - shell
      - CMakeLists.txt # CMake script for the example program
      - build
        - image_classification_demo # prebuilt example program for amd64
      - image_classification_demo.cc # example program source code
      - build.sh # build script for the example program
      - run.sh # run script for the example program
  - libs
    - PaddleLite
      - amd64
        - include # PaddleLite header files
        - lib
          - libiomp5.so # Intel OpenMP library
          - libmklml_intel.so # Intel MKL library
          - libxpuapi.so # XPU API library, providing device management and operator implementations
          - libxpurt.so # XPU runtime library
          - libpaddle_full_api_shared.so # prebuilt PaddleLite full API library
      - arm64
        - include # PaddleLite header files
        - lib
          - libxpuapi.so # XPU API library, providing device management and operator implementations
          - libxpurt.so # XPU runtime library
          - libpaddle_full_api_shared.so # prebuilt PaddleLite full API library
```
- Enter PaddleLite-linux-demo/image_classification_demo/shell and simply run ./run.sh amd64:
```shell
$ cd PaddleLite-linux-demo/image_classification_demo/shell
$ ./run.sh amd64 # an amd64 build/image_classification_demo is already provided, so the example can run without rebuilding.
$ ./run.sh arm64 # run ./build.sh arm64 on an arm64 (FT-2000+/64) server first before executing this command.
...
AUTOTUNE:(12758016, 16, 1, 2048, 7, 7, 512, 1, 1, 1, 1, 0, 0, 0) = 1by1_bsp(1, 32, 128, 128)
Find Best Result in 150 choices, avg-conv-op-time = 40 us
[INFO][XPUAPI][/home/qa_work/xpu_workspace/xpu_build_dailyjob/api_root/baidu/xpu/api/src/wrapper/conv.cpp:274] Start Tuning: (12758016, 16, 1, 512, 7, 7, 512, 3, 3, 1, 1, 1, 1, 0)
AUTOTUNE:(12758016, 16, 1, 512, 7, 7, 512, 3, 3, 1, 1, 1, 1, 0) = wpinned_bsp(1, 171, 16, 128)
Find Best Result in 144 choices, avg-conv-op-time = 79 us
I0502 22:34:18.176113 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
I0502 22:34:18.176406 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
I0502 22:34:18.176697 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
iter 0 cost: 2.116000 ms
I0502 22:34:18.178530 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
I0502 22:34:18.178792 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
iter 1 cost: 2.101000 ms
I0502 22:34:18.180634 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
I0502 22:34:18.180881 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
iter 2 cost: 2.089000 ms
I0502 22:34:18.182726 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
I0502 22:34:18.182976 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
iter 3 cost: 2.085000 ms
I0502 22:34:18.184814 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
I0502 22:34:18.185068 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
iter 4 cost: 2.101000 ms
warmup: 1 repeat: 5, average: 2.098400 ms, max: 2.116000 ms, min: 2.085000 ms
results: 3
Top0 tabby, tabby cat - 0.689418
Top1 tiger cat - 0.190557
Top2 Egyptian cat - 0.112354
Preprocess time: 1.553000 ms
Prediction time: 2.098400 ms
Postprocess time: 0.081000 ms
```
- To test a different image, copy it into PaddleLite-linux-demo/image_classification_demo/assets/images and set IMAGE_NAME in run.sh to that file name;
- To rebuild the example program, simply run ./build.sh amd64 or ./build.sh arm64:
```shell
$ cd PaddleLite-linux-demo/image_classification_demo/shell
$ ./build.sh amd64 # For amd64
$ ./build.sh arm64 # For arm64(FT-2000+/64)
```
### Updating the Model
- Obtain the ResNet50 float32 model [resnet50_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz) by training with Paddle Fluid or converting with X2Paddle;
- Since XPU is typically deployed on the server side, the PaddleLite full API is used to load the original Paddle Fluid model for inference, i.e. the relevant parameters are configured through CxxConfig, as sketched below.
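The following is a minimal sketch of what such a full-API program could look like; it assumes the public `paddle_api.h` header from the prebuilt library, the `TARGET(kXPU)`/`TARGET(kX86)` places, and an all-ones input, and it is not an excerpt from the demo source.
```c++
// Minimal sketch (assumed usage of the public full API): load the Paddle Fluid
// model with CxxConfig and run it, preferring XPU kernels with x86 fallback.
#include <vector>
#include "paddle_api.h"  // from inference_lite_lib/cxx/include

int main() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./resnet50_fp32_224_fluid");  // non-combined model dir
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
  });
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(config);

  // Feed an all-ones 1x3x224x224 image.
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.0f;

  predictor->Run();

  // Read back the 1000 classification scores.
  auto output = predictor->GetOutput(0);
  const float* scores = output->data<float>();
  (void)scores;
  return 0;
}
```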
### Updating the Paddle Lite Library with Baidu XPU Support
- Download the PaddleLite source code:
```shell
$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
$ cd Paddle-Lite
$ git checkout <release-version-tag>
```
- Download xpu_toolchain for amd64 or arm64 (FT-2000+/64):
```shell
$ wget <URL_to_download_xpu_toolchain>
$ tar -xvf output.tar.gz
$ mv output xpu_toolchain
```
- Build full_publish for amd64 or arm64 (FT-2000+/64):
```shell
For amd64 (if the build reports undefined cxx11:: symbols, switch gcc to version 4.8)
$ ./lite/tools/build.sh --build_xpu=ON --xpu_sdk_root=./xpu_toolchain x86
For arm64(FT-2000+/64)
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --build_xpu=ON --xpu_sdk_root=./xpu_toolchain --with_log=ON full_publish
```
- Replace the PaddleLite-linux-demo/libs/PaddleLite/amd64/include directory with the generated build.lite.x86/inference_lite_lib/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/amd64/lib/libpaddle_full_api_shared.so file with the generated build.lite.x86/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/include directory with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_full_api_shared.so file with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/lib/libpaddle_full_api_shared.so.
## Additional Notes
- For more information about these products, please contact Ouyang Jian at ouyangjian@baidu.com;
- The Baidu Kunlun engineering team is continuously adapting more Paddle operators in order to support more Paddle models.
......@@ -48,7 +48,7 @@ The CUDA build output is located in `build_cuda/inference_lite_lib`
4. `demo` folder: C++ demos.
If the Python option was enabled at build time, `lite_core.so` is generated under `build_cuda/inference_lite_lib/python/lib/`
If the Python option was enabled at build time, `lite.so` is generated under `build_cuda/inference_lite_lib/python/lib/`
## Running
......@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
Step 2: Run
**NOTE:**This example uses the Python API.
**NOTE:** This example uses the Python API.
``` python
#-*- coding: utf-8 -*-
......@@ -75,7 +75,7 @@ import sys
import numpy as np
import cv2
sys.path.append('build_cuda/inference_lite_lib/python/lib')
from lite_core import *
from lite import *
def read_img(im_path, resize_h, resize_w):
im = cv2.imread(im_path).astype('float32')
......
# Deploying PaddleLite for Inference on MTK APU
Paddle Lite supports inference deployment on the MTK APU.
The integration works like the earlier Huawei NPU support: the Paddle model is loaded and analyzed, Paddle operators are converted into MTK Neuron Adapter API calls (similar to the Android NN API) to construct the network, and the model is generated and executed online.
## Current Support
### Supported Chips
- [MT8168](https://www.mediatek.cn/products/tablets/mt8168)/[MT8175](https://www.mediatek.cn/products/tablets/mt8175) and other smart-device chips.
### Supported Devices
- MT8168-P2V1 Tablet.
### Supported Paddle Models
- [Fully quantized MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mobilenet_v1_int8_224_fluid.tar.gz)
### Supported (or Partially Supported) Paddle Operators
- relu
- conv2d
- depthwise_conv2d
- elementwise_add
- elementwise_mul
- fc
- pool2d
- softmax
## Reference Example
### Test Device (MT8168-P2V1 Tablet)
![mt8168_p2v1_tablet_front](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_front.jpg)
![mt8168_p2v1_tablet_back](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_back.jpg)
### Preparing the Device Environment
- A specific firmware version is required; if you are interested, contact MTK through [https://www.mediatek.cn/about/contact-us](https://www.mediatek.cn/about/contact-us) (choose "Sales" as the category) to obtain a test device and the firmware;
### Preparing the Cross-Compilation Environment
- To keep the build environment consistent, it is recommended to follow the Docker development environment described in [Compiling from Source](../user_guides/source_compile).
### Running the Image Classification Example
- Download the example program from [https://paddlelite-demo.bj.bcebos.com/devices/mediatek/PaddleLite-android-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/PaddleLite-android-demo.tar.gz); after extraction the contents are as follows:
```shell
- PaddleLite-android-demo
  - image_classification_demo
    - assets
      - images
        - tabby_cat.jpg # test image
      - labels
        - synset_words.txt # label file for the 1000 ImageNet classes
      - models
        - mobilenet_v1_int8_224_for_cpu.nb # quantized MobileNetV1 model converted by opt for the ARM CPU
        - mobilenet_v1_int8_224_for_apu.nb # quantized MobileNetV1 model converted by opt for the MTK APU
    - shell # example program for the Android shell
      - CMakeLists.txt # CMake script for the example program
      - build
        - image_classification_demo # prebuilt example program for the Android shell
      - image_classification_demo.cc # example program source code
      - build.sh # build script for the example program
      - run.sh # run script for the example program
    - apk # regular Android application
      - app
        - src
          - main
            - java # Java-layer code
            - cpp # custom JNI implementation
        - app.iml
        - build.gradle
      - gradle
      ...
  - libs
    - PaddleLite
      - arm64-v8a
        - include # PaddleLite header files
        - lib
          - libc++_shared.so
          - libpaddle_light_api_shared.so # prebuilt PaddleLite library
    - OpenCV # OpenCV 4.2 for android
```
- Android shell example program
- Enter PaddleLite-android-demo/image_classification_demo/shell and simply run ./run.sh. Note: run.sh must not be executed inside the Docker environment, otherwise the device may not be found;
- To test a different image, copy it into PaddleLite-android-demo/image_classification_demo/assets/images and set IMAGE_NAME in run.sh to that file name;
- To rebuild the example program, simply run ./build.sh. Note: build.sh must be executed inside the Docker environment, otherwise the build may fail;
- Note that because the MTK APU currently supports only the NHWC data layout while PaddleLite defaults to NCHW, an extra NCHW-to-NHWC conversion of the input tensor is performed during inference, which takes roughly 8-9 ms:
```shell
$ cd PaddleLite-android-demo/image_classification_demo/shell
$ ./run.sh
...
warmup: 5 repeat: 10, average: 30.998502 ms, max: 31.049002 ms, min: 30.937002 ms
results: 3
Top0 Egyptian cat - -0.122845
Top1 tabby, tabby cat - -0.122845
Top2 tiger cat - -0.544028
Preprocess time: 3.620000 ms
Prediction time: 30.998502 ms
Postprocess time: 0.069000 ms
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b00000, pa = 0xfb3f9000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af8000, pa = 0xfb3fa000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af7000, pa = 0xf8ffe000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af6000, pa = 0xf7bfe000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af5000, pa = 0xf7bfd000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0c000, pa = 0xfb3fe000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0b000, pa = 0xfb3ff000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0a000, pa = 0xf31ff000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b09000, pa = 0xfb3f6000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b08000, pa = 0xf7bff000, len = 255
```
- Regular Android application
- Install Android Studio 3.4;
- Open Android Studio, click "Open an existing Android Studio project" in the "Welcome to Android Studio" window, navigate to the "image_classification_demo" directory in the path selection dialog, and click the "Open" button in the lower right to import the project;
- Connect an Android phone, tablet, or development board via USB;
- Temporarily disable SELinux so the app can call system libraries:
```shell
$ adb root
# setenforce 0
```
- After the project loads, click Run -> Run 'App' in the menu bar, select the connected Android device in the "Select Deployment Target" dialog, and click "OK";
- After about one minute (the first run takes longer, please be patient), the app is installed on the device. By default it runs inference with the ARM CPU model; since the MT8168 CPU consists of four Arm Cortex-A53 cores, which are much weaker than the A7x series found in typical phones, it reaches only 6 fps, as shown below;
![mt8168_p2v1_tablet_cpu](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_cpu.jpg)
- Tap the settings button in the lower right of the app, tap "Choose pre-installed models" on the settings page, and select "mobilenet_v1_int8_for_apu"; after tapping back, the app switches to the APU model and the frame rate rises to 14 fps, as shown below.
![mt8168_p2v1_tablet_apu](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_apu.jpg)
### Updating the Model
- Obtain the MobileNetV1 float32 model [mobilenet_v1_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v1_fp32_224_fluid.tar.gz) by training with Paddle Fluid or converting with X2Paddle;
- Quantize the float32 model with PaddleSlim following [Model Quantization - Post-Training Quantization with Calibration Data](../user_guides/post_quant_with_data) (note: since the MTK APU supports only quantized ops, pay attention to the relevant parameters when launching the quantization script), which yields the fully quantized MobileNetV1 model [mobilenet_v1_int8_224_fluid](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mobilenet_v1_int8_224_fluid.tar.gz);
- Following [Model Conversion](../user_guides/model_optimize_tool), convert the model with the opt tool to generate an MTK APU model; simply set valid_targets to apu,arm:
```shell
$ ./opt --model_dir=mobilenet_v1_int8_224_fluid \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_int8_224_for_apu \
--valid_targets=apu,arm
```
- Note: the model produced by opt only marks the Paddle operators supported by the MTK APU; it does not actually generate an MTK APU model. Only at run time are the marked Paddle operators converted into MTK Neuron Adapter API calls to build the network and generate and execute the model. The resulting .nb model is loaded through the light API, as in the sketch below.
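For reference, a minimal sketch of how an opt-generated `.nb` model is loaded through the light API is given below; the header name, input layout, and file path are illustrative assumptions rather than an excerpt from the demo source.
```c++
// Minimal sketch (assumed usage of the light API): load the .nb model produced
// by opt; operators marked for the APU are converted and executed at run time.
#include <algorithm>
#include "paddle_api.h"

int main() {
  paddle::lite_api::MobileConfig config;
  config.set_model_from_file("mobilenet_v1_int8_224_for_apu.nb");
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(config);

  // MobileNetV1 expects a 1x3x224x224 NCHW float input.
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  std::fill(data, data + 1 * 3 * 224 * 224, 1.0f);

  predictor->Run();

  auto output = predictor->GetOutput(0);
  const float* scores = output->data<float>();  // 1000 class scores
  (void)scores;
  return 0;
}
```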
### Updating the Paddle Lite Library with MTK APU Support
- Download the PaddleLite source code and the APU DDK:
```shell
$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
$ cd Paddle-Lite
$ git checkout <release-version-tag>
$ wget https://paddlelite-demo.bj.bcebos.com/devices/mediatek/apu_ddk.tar.gz
$ tar -xvf apu_ddk.tar.gz
```
- Build full_publish and tiny_publish for the MT8168-P2V1 Tablet:
```shell
$ ./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared --build_extra=ON --with_log=ON --build_apu=ON --apu_ddk_root=./apu_ddk full_publish
$ ./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared --build_extra=ON --with_log=ON --build_apu=ON --apu_ddk_root=./apu_ddk tiny_publish
```
- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/include directory with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/include;
- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/lib/libpaddle_light_api_shared.so file with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/lib/libpaddle_light_api_shared.so.
## Additional Notes
- Due to licensing restrictions we cannot provide the firmware for testing, for which we apologize. If you are genuinely interested, contact MTK sales directly through the contact information mentioned above;
- The MTK engineering team is continuously adding operator bridges/converters for Paddle operators in order to support more Paddle models.
......@@ -126,3 +126,80 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
- The Huawei Da Vinci architecture NPU relies heavily on float16 internally, so the inference results show some deviation, but in most cases the accuracy loss is small; see the CPU and NPU results for the same image in the Image Classification Demo for Android of [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo).
- The in-house Da Vinci architecture NPU in the Huawei Kirin 810/990 SoCs differs from the Cambricon NPU in the Kirin 970/980 SoCs, and likewise from the NNIE used by the Hi3559A and Hi3519A; Paddle Lite supports only Huawei's in-house Da Vinci architecture NPU.
- We are continuously adding Paddle operator bridges/converters that map onto HiAI IR in order to support more Paddle models, and the Huawei engineering team keeps optimizing HiAI IR performance.
## Manual Subgraph Partitioning
### Background
- Paddle-Lite already supports a large number of Huawei NPU operators, but this still does not cover every model. For a model with some unsupported operators, Paddle-Lite partitions it into subgraphs that run on the NPU and subgraphs that run on the CPU, scheduling between NPU and CPU automatically, which usually gives good performance. In some special cases, however, the model is automatically split into many subgraphs, the switching overhead between CPU and NPU becomes large, and overall performance degrades. Manual subgraph partitioning is therefore needed to force certain operators onto the CPU and avoid an excessive number of subgraphs.
### Feature
- Specify, via a configuration file, the operators that must be forced to run on the CPU.
### Usage
- 1. Open the Paddle model file with Netron to inspect the model structure and obtain each operator's type, input names, and output names.
- Note: Paddle-Lite optimizes the model, and its operators may change; the operators of the optimized model are what counts. An example is given below.
- 2. Create a configuration file ```split_cfg.txt``` recording the operators that should run on the CPU.
- One operator record per line: the "op name", "op input names", and "op output names" fields are separated by a colon ":", and the variable names within the "op input names" and "op output names" fields are separated by commas ",".
- Input or output names may be partially omitted. For example, ```op3:in3_var0``` selects operators of type "op3" whose input is "in3_var0", while ```op4``` selects all operators of type "op4".
- Example 1:
```
op0:in0_var0,in0_var1:out0_var0,out0_var1
op1:in1_var0,in1_var1:out1_var0
op2::out2_var0
op3:in3_var0
op4
```
- Example 2:
```
transpose:conv2d_22.tmp_1:transpose_0.tmp_0
```
![image](https://user-images.githubusercontent.com/50474132/80475316-4a5fda80-897b-11ea-910a-6aee13243387.png)
- 3. Point the environment variable ```SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE``` at the configuration file.
- For example:
```
export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=/data/local/tmp/split_cfg.txt
```
- 4. After these steps, the matching operators in the model are forced to run on the CPU at runtime.
### Example
- Take the model [ssd_mobilenet_v1_pascalvoc_fp32_300_fluid](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz) as an example.
- 1. Inspect the model with Netron.
- 2. Preliminary analysis.
- The figure below shows part of ssd_mobilenet_v1. The red portion is not yet supported on the NPU, and the blue portion may not perform well on the NPU. If the prediction library is left to schedule automatically, the model may be split into many subgraphs and overall performance suffers. Instead, the blue and green portions can be pinned to the CPU as a whole, letting the remaining parts run automatically on the NPU (the red portion falls back to the CPU automatically).
![ssd_mobilenet_v1_example](https://user-images.githubusercontent.com/50474132/80453173-525b5280-895a-11ea-847f-c7dd5b5799de.png)
- 3. Convert the model with opt.
- opt prints log information during conversion; search the log for ```digraph G``` and ```// end G``` to locate the optimized model graph.
![image](https://user-images.githubusercontent.com/50474132/80454098-145f2e00-895c-11ea-9f16-dde1483a9beb.png)
![image](https://user-images.githubusercontent.com/50474132/80454123-1de89600-895c-11ea-86b9-a62d78a6616d.png)
- Save the whole graph section, from ```digraph G``` through ```// end G```, into a ```.dot``` file. It can be viewed with ```graphviz``` or in the [online viewer](http://dreampuf.github.io/GraphvizOnline/).
![image](https://user-images.githubusercontent.com/50474132/80454841-47ee8800-895d-11ea-9531-5689c5560fcb.png)
- Here, check whether the operators to be pinned have been optimized. (Ideally each of them still exists independently; if some have been fused into a single operator, the fused operator must be specified instead.)
- 4. Write the configuration file.
- In the configuration file, list the operators that the NPU could support but that should nevertheless run on the CPU:
```
reshape
transpose
concat
softmax
```
- Since every instance of these operator types is to be pinned to the CPU, there is no need to specify their input and output names.
- 5. Set the configuration file path.
- This is done via ```export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=your_split_config_file```.
- 6. Performance test.
- Device: Huawei Mate 30 5G
- HiAI DDK version: 320
- Performance: about 71.8 ms on CPU, about 16.6 ms on NPU.
# Deploying PaddleLite for Inference on RK NPU
Paddle Lite supports inference deployment on the RK NPU.
The integration works like the earlier Huawei NPU support: the Paddle model is loaded and analyzed, Paddle operators are converted into RK graph-building API calls to construct the network, and the model is generated and executed online.
## Current Support
### Supported Chips
- RK1808 and RK1806; RK3399Pro is not yet supported.
### Supported Devices
- RK1808/1806 EVB.
### Supported Paddle Models
- [Fully quantized MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/mobilenet_v1_int8_224_fluid.tar.gz)
### Supported (or Partially Supported) Paddle Operators
- relu
- conv2d
- depthwise_conv2d
- pool2d
- fc
- softmax
- batch_norm
- concat
- elementwise_add
- elementwise_sub
- elementwise_mul
- elementwise_div
## Reference Example
### Test Device (RK1808 EVB)
![rk1808_evb_front](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/rk1808_evb_front.jpg)
![rk1808_evb_back](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/rk1808_evb_back.jpg)
### Preparing the Device Environment
- A specific firmware version is required; please update the device firmware following the instructions in [rknpu_ddk](https://github.com/airockchip/rknpu_ddk);
- After flashing the firmware, the RK1808 EVB is a bare Linux system on which software cannot be installed conveniently with apt-get as on Ubuntu, so both the example program and the PaddleLite library are built by cross-compilation;
- Plug a Micro-USB cable into the device's Micro-USB OTG port and you can interact with it through Android's adb commands, with no need to configure the network for SSH or go through the serial port; a very convenient design.
### Preparing the Cross-Compilation Environment
- To keep the build environment consistent, it is recommended to follow the Docker development environment described in [Compiling from Source](../user_guides/source_compile).
### Running the Image Classification Example
- Download the example program from [https://paddlelite-demo.bj.bcebos.com/devices/rockchip/PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/PaddleLite-linux-demo.tar.gz); after extraction the contents are as follows:
```shell
- PaddleLite-linux-demo
  - image_classification_demo
    - assets
      - images
        - tabby_cat.jpg # test image
        - tabby_cat.raw # test image already converted to raw data
      - labels
        - synset_words.txt # label file for the 1000 ImageNet classes
      - models
        - mobilenet_v1_int8_224_for_cpu.nb # quantized MobileNetV1 model converted by opt for the ARM CPU
        - mobilenet_v1_int8_224_for_rknpu.nb # quantized MobileNetV1 model converted by opt for the RK NPU
    - shell
      - CMakeLists.txt # CMake script for the example program
      - build
        - image_classification_demo # prebuilt example program
      - image_classification_demo.cc # example program source code
      - convert_to_raw_image.py # Python script that saves a test image as raw data
      - build.sh # build script for the example program
      - run.sh # run script for the example program
  - libs
    - PaddleLite
      - arm64
        - include # PaddleLite header files
        - lib
          - libGAL.so # RK DDK libraries
          - libOpenVX.so
          - libVSC.so
          - librknpu_ddk.so
          - libgomp.so.1 # GNU OpenMP library
          - libpaddle_light_api_shared.so # prebuilt PaddleLite library
      - armhf
        - include # PaddleLite header files
        - lib
          - libGAL.so
          - libOpenVX.so
          - libVSC.so
          - librknpu_ddk.so
          - libgomp.so.1
          - libpaddle_light_api_shared.so
```
- Enter PaddleLite-linux-demo/image_classification_demo/shell and simply run ./run.sh arm64. Note: run.sh must not be executed inside the Docker environment, otherwise the device cannot be found;
```shell
$ cd PaddleLite-linux-demo/image_classification_demo/shell
$ ./run.sh arm64 # For RK1808 EVB
$ ./run.sh armhf # For RK1806 EVB
...
warmup: 5 repeat: 10, average: 6.499500 ms, max: 6.554000 ms, min: 6.468000 ms
results: 3
Top0 Egyptian cat - 0.532328
Top1 tabby, tabby cat - 0.345136
Top2 tiger cat - 0.111146
Preprocess time: 2.414000 ms
Prediction time: 6.499500 ms
Postprocess time: 0.414000 ms
```
- To test a different image, generate the raw data with the convert_to_raw_image.py tool;
- To rebuild the example program, simply run ./build.sh. Note: build.sh must be executed inside the Docker environment, otherwise the build may fail.
### Updating the Model
- Obtain the MobileNetV1 float32 model [mobilenet_v1_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v1_fp32_224_fluid.tar.gz) by training with Paddle Fluid or converting with X2Paddle;
- Quantize the float32 model with PaddleSlim following [Model Quantization - Post-Training Quantization with Calibration Data](../user_guides/post_quant_with_data) (note: since the RK NPU supports only tensor-wise fully quantized models, pay attention to the relevant parameters when launching the quantization script), which yields the fully quantized MobileNetV1 model [mobilenet_v1_int8_224_fluid](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/mobilenet_v1_int8_224_fluid.tar.gz);
- Following [Model Conversion](../user_guides/model_optimize_tool), convert the model with the opt tool to generate an RK NPU model; simply set valid_targets to rknpu,arm:
```shell
$ ./opt --model_dir=mobilenet_v1_int8_224_fluid \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_int8_224_for_rknpu \
--valid_targets=rknpu,arm
```
- Note: the model produced by opt only marks the Paddle operators supported by the RK NPU; it does not actually generate an RK NPU model. Only at run time are the marked Paddle operators converted into RK graph-building API calls to generate and execute the model. The resulting .nb model is loaded through the light API, as in the sketch below.
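The generated `.nb` model is consumed through the light API in the same way as in the MTK APU sketch above; the snippet below only adds the CPU thread-count and power-mode settings, whose values are illustrative assumptions.
```c++
// Minimal sketch (assumed usage of the light API): load the RK NPU .nb model;
// operators not supported by the NPU fall back to the ARM CPU kernels.
#include "paddle_api.h"

int main() {
  paddle::lite_api::MobileConfig config;
  config.set_model_from_file("mobilenet_v1_int8_224_for_rknpu.nb");
  config.set_threads(1);                                     // illustrative value
  config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH);  // illustrative value

  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(config);
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.0f;

  predictor->Run();
  auto output = predictor->GetOutput(0);
  const float* scores = output->data<float>();  // 1000 class scores
  (void)scores;
  return 0;
}
```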
### Updating the Paddle Lite Library with RK NPU Support
- Download the PaddleLite source code and the RK DDK:
```shell
$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
$ cd Paddle-Lite
$ git checkout <release-version-tag>
$ git clone https://github.com/airockchip/rknpu_ddk.git
```
- Build full_publish and tiny_publish for the RK1808 and RK1806 EVBs:
```shell
For RK1808 EVB
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
For RK1806 EVB
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
```
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/include directory with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so file with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_light_api_shared.so;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/include directory with the generated build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so file with the generated build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_light_api_shared.so.
## Additional Notes
- The Rockchip engineering team is continuously adding operator bridges/converters for Paddle operators in order to support more Paddle models.
......@@ -2,15 +2,14 @@
Paddle-Lite supports building the x86 inference library in a Docker or Linux environment. For environment setup, see [Environment Preparation](../user_guides/source_compile).
(Note: a non-Docker Linux environment must be Ubuntu 16.04.)
## Building
1. Download the code
```bash
# download the Paddle-Lite source code
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
# switch to a release branch
git checkout release/v2.3
git checkout release/v2.6.0
```
2. Build from source
......@@ -18,6 +17,9 @@ git checkout release/v2.3
```bash
cd Paddle-Lite
./lite/tools/build.sh x86
# other optional build flags
# --with_log=OFF  disable log output
```
## Build Output
......@@ -31,35 +33,68 @@ The x86 build output is located in `build.lite.x86/inference_lite_lib`
- `include` : header files
- `lib` : library files
- Bundled static libraries:
- `libpaddle_api_full_bundled.a`: static library with both full_api and light_api functionality
- `libpaddle_api_light_bundled.a`: static library with light_api functionality only
- Bundled dynamic libraries:
- `libpaddle_full_api_shared.so`: dynamic library with both full_api and light_api functionality
- `libpaddle_light_api_shared.so`: dynamic library with light_api functionality only
- Static libraries:
- `libpaddle_api_full_bundled.a`: full_api static library
- `libpaddle_api_light_bundled.a`: light_api static library
- Dynamic libraries:
- `libpaddle_full_api_shared.so`: full_api dynamic library
- `libpaddle_light_api_shared.so`: light_api dynamic library
3. `third_party` folder: the third-party mklml library that the inference library depends on
- mklml: the mklml math library required by the Paddle-Lite inference library
4. `demo/cxx` folder: C++ example demos for the x86 inference library
- `mobilenetv1_full`: C++ demo that runs mobilenet_v1 inference with the full_api
- `mobilenetv1_light`: C++ demo that runs mobilenet_v1 inference with the light_api
3. `third_party` folder: third-party library files
## Using the x86 Inference API
1. We provide an example of running mobilenet_v1 with the x86 API on Linux: [mobilenet_full_x86demo](https://paddlelite-data.bj.bcebos.com/x86/mobilenet_full_x86demo.zip). After downloading and extracting, the contents are as follows:
1. Layout of the `mobilenetv1_full` directory
![](https://paddlelite-data.bj.bcebos.com/x86/x86-doc/demo.png)
```bash
mobilenetv1_full/
|-- CMakeLists.txt
|-- build.sh
`-- mobilenet_full_api.cc
```
`mobilenet_v1` is the model file, `lib` and `include` are the Paddle-Lite inference library and header files respectively, `third_party` contains the third-party `mklml` library required at build time, `mobilenet_full_api.cc` is the source code of the x86 example, and `build.sh` is the build script.
This demo is built with CMake: `CMakeLists.txt` is the CMake script, `mobilenet_full_api.cc` is the source code of the x86 example, and `build.sh` is the build script.
2. Demo contents and usage
2. How to use the demo
``` bash
# 1. Build
cd mobilenetv1_full
sh build.sh
```
The build produces `mobilenet_full_api` in the current directory.
``` bash
# 2. Run inference
mobilenet_full_api mobilenet_v1
./mobilenet_full_api ./mobilenet_v1
```
`mobilenet_v1` is the path to the model in the current directory, and `mobilenet_full_api` is the executable built in step 1.
Download and extract the model [`mobilenet_v1`](http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz) into the current directory, then run the command above to perform inference.
```bash
# 3. The demo output is shown below: mobilenet_v1 predictions for an all-ones input
Output shape 1000
Output[0]: 0.000191312
Output[100]: 0.000159713
Output[200]: 0.000264313
Output[300]: 0.000210793
Output[400]: 0.00103236
Output[500]: 0.000110071
Output[600]: 0.00482924
Output[700]: 0.00184533
Output[800]: 0.000202116
Output[900]: 0.000585591
```
3. Example source `mobilenet_full_api.cc`. A minimal sketch of its typical structure is shown below.
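The following is a minimal sketch of the structure such a full-API x86 demo source typically has, assuming the standard `paddle_api.h` interface; refer to the actual `mobilenet_full_api.cc` shipped with the demo for the authoritative version.
```c++
// Minimal sketch (assumed structure of a full-API x86 demo): build a predictor
// from the un-optimized mobilenet_v1 model and print a few scores for an
// all-ones input, matching the sample output above.
#include <iostream>
#include <string>
#include "paddle_api.h"

using namespace paddle::lite_api;  // NOLINT

void RunModel(const std::string& model_dir) {
  CxxConfig config;
  config.set_model_dir(model_dir);
  config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)},
                           Place{TARGET(kHost), PRECISION(kFloat)}});
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);

  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.0f;

  predictor->Run();

  auto output = predictor->GetOutput(0);
  const int64_t num = output->shape()[1];  // 1000 classes
  std::cout << "Output shape " << num << std::endl;
  for (int64_t i = 0; i < num; i += 100) {
    std::cout << "Output[" << i << "]: " << output->data<float>()[i] << std::endl;
  }
}

int main(int argc, char** argv) {
  if (argc < 2) {
    std::cerr << "usage: ./mobilenet_full_api <model_dir>" << std::endl;
    return 1;
  }
  RunModel(argv[1]);
  return 0;
}
```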
......
......@@ -54,6 +54,9 @@ Welcome to Paddle-Lite's documentation!
demo_guides/opencl
demo_guides/fpga
demo_guides/npu
demo_guides/baidu_xpu
demo_guides/rockchip_npu
demo_guides/mediatek_apu
.. toctree::
:maxdepth: 1
......
......@@ -16,7 +16,6 @@ message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
message(STATUS "LITE_WITH_ARM_LANG:\t${LITE_WITH_ARM_LANG}")
set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK})
......@@ -188,15 +187,17 @@ if (LITE_WITH_CUDA OR LITE_WITH_X86)
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
if (LITE_WITH_CUDA)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
add_dependencies(publish_inference publish_inference_third_party)
endif()
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party)
endif()
endif()
......@@ -238,9 +239,13 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference_x86_cxx_lib test_model_bin)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND rm -rf "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
)
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/mklml" "${INFER_LITE_PUBLISH_ROOT}/third_party/"
)
add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
......@@ -369,6 +374,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_libs" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_libs/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_libs/Makefile"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG))
lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else()
lite_cc_library(place SRCS paddle_place.cc DEPS glog)
......
......@@ -48,6 +48,7 @@ USE_LITE_OP(concat)
USE_LITE_OP(conv2d)
USE_LITE_OP(depthwise_conv2d)
USE_LITE_OP(pool2d)
USE_LITE_OP(max_pool2d_with_index)
USE_LITE_OP(batch_norm)
USE_LITE_OP(fusion_elementwise_sub_activation)
USE_LITE_OP(transpose)
......
......@@ -151,6 +151,11 @@ std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
// get outputnames
std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// get param names
std::vector<std::string> Predictor::GetParamNames() {
return exec_scope_->AttributeVarNames();
}
// append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() {
if (!program_) {
......@@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) {
if (valid_place.target == TARGET(kOpenCL)) continue;
inner_places.emplace_back(
Place(TARGET(kHost), valid_place.precision, valid_place.layout));
}
......@@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() {
const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return &var->Get<lite::Tensor>();
}
lite::Tensor *Predictor::GetMutableTensor(const std::string &name) {
auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return var->GetMutable<lite::Tensor>();
}
// get input by name
lite::Tensor *Predictor::GetInputByName(const std::string &name) {
auto element = std::find(input_names_.begin(), input_names_.end(), name);
......
......@@ -85,6 +85,9 @@ class LITE_API Predictor {
// get inputnames and get outputnames.
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
// get param names
std::vector<std::string> GetParamNames();
void PrepareFeedFetch();
// Get offset-th col of fetch results.
......@@ -92,6 +95,9 @@ class LITE_API Predictor {
std::vector<const lite::Tensor*> GetOutputs() const;
const cpp::ProgramDesc& program_desc() const;
// get a mutable tensor according to its name
lite::Tensor* GetMutableTensor(const std::string& name);
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const;
......@@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
// get inputs names and get outputs names
std::vector<std::string> GetInputNames() override;
std::vector<std::string> GetOutputNames() override;
// get param names
std::vector<std::string> GetParamNames() override;
// get tensor according to tensor's name
std::unique_ptr<const lite_api::Tensor> GetTensor(
const std::string& name) const override;
// get a mutable tensor according to tensor's name
std::unique_ptr<lite_api::Tensor> GetMutableTensor(
const std::string& name) override;
// Get InputTensor by name
std::unique_ptr<lite_api::Tensor> GetInputByName(
......
......@@ -97,6 +97,10 @@ std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
return raw_predictor_.GetInputNames();
}
std::vector<std::string> CxxPaddleApiImpl::GetParamNames() {
return raw_predictor_.GetParamNames();
}
std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
return raw_predictor_.GetOutputNames();
}
......@@ -123,6 +127,12 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor(
return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
}
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetMutableTensor(
const std::string &name) {
return std::unique_ptr<lite_api::Tensor>(
new lite_api::Tensor(raw_predictor_.GetMutableTensor(name)));
}
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName(
const std::string &name) {
return std::unique_ptr<lite_api::Tensor>(
......
......@@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0");
DEFINE_string(input_shape_0,
"1,3,224,224",
"input shapes another, separated by colon and comma");
DEFINE_string(target, "arm", "main target for Predictor: arm, opencl");
DEFINE_bool(use_optimize_nb,
false,
"optimized & naive buffer model for mobile devices");
......@@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
if (FLAGS_target == "arm") {
config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
} else if (FLAGS_target == "opencl") {
config.set_valid_places({
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
Place{TARGET(kARM)}, // enable kARM CPU kernel when no opencl kernel
});
}
auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model
......@@ -78,7 +88,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
int tid,
const int warmup_times = 5) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode);
config.set_threads(thread_num);
......@@ -197,7 +207,7 @@ void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat,
int warmup = 5) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode);
config.set_threads(thread_num);
......@@ -218,13 +228,13 @@ void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat,
int warmup = 5) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
config.set_model_dir(model_dir_0);
config.set_model_from_file(model_dir_0 + ".nb");
auto predictor_0 = lite_api::CreatePaddlePredictor(config);
for (int i = 0; i < 2 * repeat; i += 2) {
......@@ -246,7 +256,8 @@ int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model";
<< "--model_dir /path/to/your/model --model_dir_0 "
"/path/to/your/model0 --target `arm` or `opencl`";
exit(0);
}
std::string save_optimized_model_dir = "";
......
......@@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model");
DEFINE_string(param_file, "", "param file path of the combined-param model");
DEFINE_string(
optimize_out_type,
"protobuf",
"naive_buffer",
"store type of the output optimized model. protobuf/naive_buffer");
DEFINE_bool(display_kernels, false, "Display kernel information");
DEFINE_bool(record_tailoring_info,
......@@ -207,7 +207,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
}
std::cout << std::setiosflags(std::ios::internal);
std::cout << std::setw(maximum_optype_length) << "OP_name";
for (int i = 0; i < targets.size(); i++) {
for (size_t i = 0; i < targets.size(); i++) {
std::cout << std::setw(10) << targets[i].substr(1);
}
std::cout << std::endl;
......@@ -215,7 +215,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) {
std::cout << std::setw(maximum_optype_length) << it->first;
auto ops_valid_places = it->second;
for (int i = 0; i < targets.size(); i++) {
for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(),
ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) {
......@@ -235,7 +235,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
}
// Print OP info.
auto ops_valid_places = supported_ops.at(*op);
for (int i = 0; i < targets.size(); i++) {
for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(),
ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) {
......@@ -288,11 +288,11 @@ void ParseInputCommand() {
auto valid_places = paddle::lite_api::ParserValidPlaces();
// get valid_targets string
std::vector<TargetType> target_types = {};
for (int i = 0; i < valid_places.size(); i++) {
for (size_t i = 0; i < valid_places.size(); i++) {
target_types.push_back(valid_places[i].target);
}
std::string targets_str = TargetToStr(target_types[0]);
for (int i = 1; i < target_types.size(); i++) {
for (size_t i = 1; i < target_types.size(); i++) {
targets_str = targets_str + TargetToStr(target_types[i]);
}
......@@ -301,7 +301,7 @@ void ParseInputCommand() {
target_types.push_back(TARGET(kUnk));
std::set<std::string> valid_ops;
for (int i = 0; i < target_types.size(); i++) {
for (size_t i = 0; i < target_types.size(); i++) {
auto ops = supported_ops_target[static_cast<int>(target_types[i])];
valid_ops.insert(ops.begin(), ops.end());
}
......@@ -318,7 +318,7 @@ void CheckIfModelSupported() {
auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))];
valid_ops.insert(
valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end());
for (int i = 0; i < valid_places.size(); i++) {
for (size_t i = 0; i < valid_places.size(); i++) {
auto target = valid_places[i].target;
auto ops = supported_ops_target[static_cast<int>(target)];
valid_ops.insert(valid_ops.end(), ops.begin(), ops.end());
......@@ -340,7 +340,7 @@ void CheckIfModelSupported() {
std::set<std::string> unsupported_ops;
std::set<std::string> input_model_ops;
for (int index = 0; index < cpp_prog.BlocksSize(); index++) {
for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) {
auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index);
for (size_t i = 0; i < current_block->OpsSize(); ++i) {
auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i);
......@@ -364,13 +364,13 @@ void CheckIfModelSupported() {
unsupported_ops_str = unsupported_ops_str + ", " + *op_str;
}
std::vector<TargetType> targets = {};
for (int i = 0; i < valid_places.size(); i++) {
for (size_t i = 0; i < valid_places.size(); i++) {
targets.push_back(valid_places[i].target);
}
std::sort(targets.begin(), targets.end());
targets.erase(unique(targets.begin(), targets.end()), targets.end());
std::string targets_str = TargetToStr(targets[0]);
for (int i = 1; i < targets.size(); i++) {
for (size_t i = 1; i < targets.size(); i++) {
targets_str = targets_str + "," + TargetToStr(targets[i]);
}
......
......@@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
"command argument 'valid_targets'";
}
void OptBase::SetOptimizeOut(const std::string& optimized_out_path) {
optimize_out_path_ = optimized_out_path;
void OptBase::SetLiteOut(const std::string& lite_out_name) {
lite_out_name_ = lite_out_name;
}
void OptBase::RunOptimize(bool record_strip_info) {
void OptBase::RecordModelInfo(bool record_strip_info) {
record_strip_info_ = record_strip_info;
}
void OptBase::Run() {
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info);
RunOptimizeFromModelSet(record_strip_info_);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name =
record_strip_info ? "information of striped model" : "optimized model";
record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << optimize_out_path_ << "successfully";
<< " into :" << lite_out_name_ << "successfully";
}
}
void OptBase::RunOptimize(const std::string& model_dir_path,
const std::string& model_path,
const std::string& param_path,
const std::string& valid_places,
const std::string& optimized_out_path) {
SetModelDir(model_dir_path);
SetModelFile(model_path);
SetParamFile(param_path);
SetValidPlaces(valid_places);
SetLiteOut(optimized_out_path);
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info_);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name =
record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << lite_out_name_ << "successfully";
}
}
// collect ops info of modelset
void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models,
......@@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) {
}
void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
// 1. mkdir of outputed optimized model set.
lite::MkDirRecur(optimize_out_path_);
lite::MkDirRecur(lite_out_name_);
auto model_dirs = lite::ListDir(model_set_dir_, true);
if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model";
......@@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
std::string input_model_dir =
lite::Join<std::string>({model_set_dir_, name}, "/");
std::string output_model_dir =
lite::Join<std::string>({optimize_out_path_, name}, "/");
lite::Join<std::string>({lite_out_name_, name}, "/");
if (opt_config_.model_file() != "" && opt_config_.param_file() != "") {
auto model_file_path =
......@@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
lite_out_name_, model_type_, record_strip_info);
std::cout << "Optimize done. ";
}
......@@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
if (record_strip_info) {
// Collect all models information
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
lite_out_name_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
lite_out_name_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo(optimize_out_path_,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
lite_out_name_, model_dirs, lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
lite_out_name_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
std::cout << "Record the information of stripped models into :"
<< optimize_out_path_ << "successfully";
<< lite_out_name_ << "successfully";
}
}
void OptBase::PrintHelpInfo() {
const std::string opt_version = lite::version();
const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed "
"below:\n"
"------------------------------------------------------------------------"
"-----------------------------------------------------------\n"
" Valid arguments of Paddle-Lite opt are listed below:\n"
"------------------------------------------------------------------------"
"-----------------------------------------------------------\n"
" Arguments of help information:\n"
" `help()` Print help infomation\n"
" Arguments of model optimization:\n"
"\n"
" Arguments of model transformation:\n"
" `set_model_dir(model_dir)`\n"
" `set_model_file(model_file_path)`\n"
" `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n"
" `set_optimize_out(output_optimize_model_dir)`\n"
" `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
"default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `run_optimize(false|true)`\n"
" ` ----fasle&true refer to whether to record ops info for "
"tailoring lib, false by default`\n"
" Arguments of model checking and ops information:\n"
" `record_model_info(false|true)`: refer to whether to record ops "
"info for striping lib, false by default`\n"
" `run() : start model transformation`\n"
" eg. `opt.set_model_dir(\"./mobilenetv1\"); "
"opt.set_lite_out(\"mobilenetv1_opt\"); opt.set_valid_places(\"arm\"); "
"opt.run();`\n"
"\n"
" You can also transform model through a single input argument:\n"
" `run_optimize(model_dir, model_file_path, param_file_path, "
"model_type, valid_places, lite_out_name) `\n"
" eg. `opt.run_optimize(\"./mobilenetv1\", \"\", \"\", "
"\"naive_buffer\", \"arm\", \"mobilenetv1_opt\");`"
"\n"
" Arguments of checking model and printing ops information:\n"
" `print_all_ops()` Display all the valid operators of "
"Paddle-Lite\n"
" `print_supported_ops` Display supported operators of valid "
"places\n"
" `check_if_model_supported()` Check if the input model is "
"supported\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
"supported\n"
"------------------------------------------------------------------------"
"-----------------------------------------------------------\n";
std::cout << "opt version:" << opt_version << std::endl << help_info;
}
// 2. Print supported info of inputed ops
void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
......
......@@ -44,16 +44,21 @@ class LITE_API OptBase {
public:
OptBase() = default;
void SetModelSetDir(const std::string &model_set_path);
void SetModelDir(const std::string &model_path);
void SetModelDir(const std::string &model_dir_path);
void SetModelFile(const std::string &model_path);
void SetParamFile(const std::string &param_path);
void SetValidPlaces(const std::string &valid_places);
void SetOptimizeOut(const std::string &optimized_out_path);
void SetLiteOut(const std::string &lite_out_name);
void RecordModelInfo(bool record_strip_info = true);
// set optimized_model type
void SetModelType(std::string model_type);
// transform and save the optimized model
void RunOptimize(bool record_strip_info = false);
void Run();
void RunOptimize(const std::string &model_dir_path = "",
const std::string &model_path = "",
const std::string &param_path = "",
const std::string &valid_places = "",
const std::string &optimized_out_path = "");
// fuctions of printing info
// 1. help info
void PrintHelpInfo();
......@@ -71,12 +76,12 @@ class LITE_API OptBase {
// valid places for the optimized_model
std::vector<Place> valid_places_;
// filename of the optimized_model
std::string optimize_out_path_;
std::string lite_out_name_;
// type of the optimized_model, kNaiveBuffer default.
LiteModelType model_type_{LiteModelType::kNaiveBuffer};
// Dir path of a set of models, this should be combined with model
std::string model_set_dir_;
bool record_strip_info_{false};
void RunOptimizeFromModelSet(bool record_strip_info = false);
};
......
......@@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); }
void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); }
std::unique_ptr<Tensor> PaddlePredictor::GetMutableTensor(
const std::string &name) {
LOG(FATAL)
<< "The GetMutableTensor API is only supported by CxxConfig predictor.";
return nullptr;
}
std::vector<std::string> PaddlePredictor::GetParamNames() {
std::vector<std::string> null_result = {};
LOG(FATAL)
<< "The GetParamNames API is only supported by CxxConfig predictor.";
return null_result;
}
void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir,
LiteModelType model_type,
bool record_info) {
......
......@@ -86,6 +86,8 @@ class LITE_API PaddlePredictor {
virtual std::vector<std::string> GetInputNames() = 0;
// Get output names
virtual std::vector<std::string> GetOutputNames() = 0;
// Get output names
virtual std::vector<std::string> GetParamNames();
// Get Input by name
virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0;
......@@ -93,6 +95,9 @@ class LITE_API PaddlePredictor {
/// Get a readonly tensor, return null if no one called `name` exists.
virtual std::unique_ptr<const Tensor> GetTensor(
const std::string& name) const = 0;
/// Get a mutable tensor, return null if no one called `name` exists
/// internal inference API, not recommended.
virtual std::unique_ptr<Tensor> GetMutableTensor(const std::string& name);
/// Persist the optimized model to disk. This API is only supported by
/// CxxConfig, and the persisted model can be reused for MobileConfig.
......@@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase {
#endif
#ifdef LITE_WITH_CUDA
void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
int multi_stream() const { return multi_stream_; }
bool multi_stream() const { return multi_stream_; }
#endif
#ifdef LITE_WITH_MLU
......@@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase {
// current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
// XPU only, specify the target device ID for the current thread.
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
void set_xpu_dev_per_thread(int dev_no = 0);
};
......
......@@ -19,7 +19,13 @@
#pragma once
// some platform-independent definitions
#include "lite/utils/macros.h"
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif
#define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \
......
......@@ -161,6 +161,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kBM),
TARGET(kMLU),
TARGET(kAPU),
TARGET(kRKNPU),
TARGET(kFPGA)});
if (target == TARGET(kAny)) {
return valid_set;
......
......@@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
USE_MIR_PASS(lite_interpolate_fuse_pass);
USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass);
USE_MIR_PASS(identity_scale_eliminate_pass);
USE_MIR_PASS(identity_dropout_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass);
USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
......@@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
USE_MIR_PASS(__xpu__fc_fuse_pass);
......@@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) {
.def("set_model_file", &OptBase::SetModelFile)
.def("set_param_file", &OptBase::SetParamFile)
.def("set_valid_places", &OptBase::SetValidPlaces)
.def("set_optimize_out", &OptBase::SetOptimizeOut)
.def("set_lite_out", &OptBase::SetLiteOut)
.def("set_model_type", &OptBase::SetModelType)
.def("record_model_info", &OptBase::RecordModelInfo)
.def("run", &OptBase::Run)
.def("run_optimize", &OptBase::RunOptimize)
.def("help", &OptBase::PrintHelpInfo)
.def("print_supported_ops", &OptBase::PrintSupportedOps)
......
......@@ -33,11 +33,17 @@ else:
PADDLELITE_VERSION = PADDLELITE_TAG
# core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
files = os.listdir('${PADDLE_BINARY_DIR}')
INFERENCE_LITE_LIB_PATH = ''
for file in files:
if file.find('inference_lite_lib') == 0:
INFERENCE_LITE_LIB_PATH = '${PADDLE_BINARY_DIR}/' + file
break
LITE_PATH = INFERENCE_LITE_LIB_PATH + '/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']}
# put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
LIB_PATH = INFERENCE_LITE_LIB_PATH + '/python/install/libs/'
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
......@@ -49,8 +55,7 @@ if '${WITH_MKL}' == 'ON':
PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll']
# link lite.so to paddlelite.libs
if os.name != 'nt':
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' " + LITE_PATH + "/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
......
......@@ -15,6 +15,7 @@
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <thread> //NOLINT
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
......@@ -30,14 +31,18 @@ DEFINE_string(input_img_txt_path,
namespace paddle {
namespace lite {
void TestModel(const std::vector<Place>& valid_places) {
const int g_batch_size = 1;
const int g_thread_num = 1;
void instance_run() {
lite::Predictor predictor;
std::vector<std::string> passes;
std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, "", "", valid_places, passes);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(
std::vector<DDim::value_type>({1, 3, FLAGS_im_height, FLAGS_im_width})));
input_tensor->Resize(DDim(std::vector<DDim::value_type>(
{g_batch_size, 3, FLAGS_im_height, FLAGS_im_width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
if (FLAGS_input_img_txt_path.empty()) {
......@@ -45,12 +50,15 @@ void TestModel(const std::vector<Place>& valid_places) {
data[i] = 1;
}
} else {
std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
if (!fs.is_open()) {
LOG(FATAL) << "open input_img_txt error.";
}
for (int i = 0; i < item_size; i++) {
fs >> data[i];
for (int j = 0; j < g_batch_size; j++) {
std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
if (!fs.is_open()) {
LOG(FATAL) << "open input_img_txt error.";
}
for (int i = 0; i < item_size / g_batch_size; i++) {
fs >> data[i];
}
data += j * item_size / g_batch_size;
}
}
for (int i = 0; i < FLAGS_warmup; ++i) {
......@@ -72,6 +80,7 @@ void TestModel(const std::vector<Place>& valid_places) {
FILE* fp = fopen("result.txt", "wb");
for (int i = 0; i < out.size(); i++) {
auto* out_data = out[i]->data<float>();
LOG(INFO) << out[i]->numel();
for (int j = 0; j < out[i]->numel(); j++) {
fprintf(fp, "%f\n", out_data[j]);
}
......@@ -79,6 +88,16 @@ void TestModel(const std::vector<Place>& valid_places) {
fclose(fp);
}
void TestModel(const std::vector<Place>& valid_places) {
std::vector<std::unique_ptr<std::thread>> instances_vec;
for (int i = 0; i < g_thread_num; ++i) {
instances_vec.emplace_back(new std::thread(&instance_run));
}
for (int i = 0; i < g_thread_num; ++i) {
instances_vec[i]->join();
}
}
TEST(Classify, test_bm) {
std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
......
......@@ -2,4 +2,5 @@ if(NOT LITE_WITH_APU)
return()
endif()
lite_cc_library(device_apu SRCS device.cc)
lite_cc_library(neuron_adapter SRCS neuron_adapter.cc)
lite_cc_library(device_apu SRCS device.cc DEPS neuron_adapter)
......@@ -20,48 +20,19 @@ namespace paddle {
namespace lite {
namespace apu {
inline void* LoadFunc(void* libHandle, const char* name) {
CHECK(libHandle != nullptr);
CHECK(name != nullptr);
void* fn = dlsym(libHandle, name);
if (fn == nullptr) {
LOG(WARNING) << "Unable to open Neuron Runtime function [" << name
<< "] Because " << dlerror();
}
return fn;
}
NeuronCompilation* Device::Build(void* libHandle, NeuronModel* model) {
typedef int (*NeuronCompilation_create)(NeuronModel * model,
NeuronCompilation * *compilation);
typedef void (*NeuronCompilation_free)(NeuronCompilation * compilation);
typedef int (*NeuronCompilation_finish)(NeuronCompilation * compilation);
#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \
FUNC_NAME VARIABLE_NAME = \
reinterpret_cast<FUNC_NAME>(LoadFunc(libHandle, #FUNC_NAME));
LOAD_FUNCTIONS(libHandle, NeuronCompilation_create, neuron_compilation_create)
LOAD_FUNCTIONS(libHandle, NeuronCompilation_free, neuron_compilation_free)
LOAD_FUNCTIONS(libHandle, NeuronCompilation_finish, neuron_compilation_finish)
#undef LOAD_FUNCTIONS
int neuron_errCode = 0;
NeuronCompilation* compilation = NULL;
NeuronCompilation* Device::Build(NeuronModel* model) {
VLOG(3) << "[APU] Compile model";
neuron_errCode = (*neuron_compilation_create)(model, &compilation);
NeuronCompilation* compilation = NULL;
int neuron_errCode = NeuronCompilation_create(model, &compilation);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "[APU] create compile failed! " << neuron_errCode;
return nullptr;
}
neuron_errCode = (*neuron_compilation_finish)(compilation);
neuron_errCode = NeuronCompilation_finish(compilation);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "[APU] compile failed! " << neuron_errCode;
return nullptr;
}
VLOG(3) << "[APU] Build done";
return compilation;
}
......
......@@ -18,7 +18,7 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "NeuronAdapter.h" // NOLINT
#include "lite/backends/apu/neuron_adapter.h"
namespace paddle {
namespace lite {
......@@ -32,7 +32,7 @@ class Device {
}
Device() {}
NeuronCompilation* Build(void* libHandle, NeuronModel* model);
NeuronCompilation* Build(NeuronModel* model);
};
} // namespace apu
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/apu/neuron_adapter.h"
#include <dlfcn.h>
#include <string>
#include <vector>
namespace paddle {
namespace lite {
NeuronAdapter* NeuronAdapter::Global() {
static NeuronAdapter adapter;
return &adapter;
}
NeuronAdapter::NeuronAdapter() {
CHECK(InitHandle()) << "Fail to initialize the Neuron Adapter library!";
InitFunctions();
}
bool NeuronAdapter::InitHandle() {
const std::vector<std::string> paths = {
"libneuron_adapter.so",
#if defined(__aarch64__)
"/vendor/lib64/libneuron_adapter.so",
"/system/lib64/libneuron_adapter.so",
"/system/vendor/lib64/libneuron_adapter.so",
#else
"/vendor/lib/libneuron_adapter.so",
"/system/lib/libneuron_adapter.so",
"/system/vendor/lib/libneuron_adapter.so",
#endif
};
std::string target_lib = "Unknown";
for (auto path : paths) {
handle_ = dlopen(path.c_str(), RTLD_LAZY);
if (handle_ != nullptr) {
target_lib = path;
break;
}
}
VLOG(4) << "Load the Neuron Adapter library from " << target_lib;
if (handle_ != nullptr) {
return true;
} else {
return false;
}
}
void NeuronAdapter::InitFunctions() {
CHECK(handle_ != nullptr) << "The library handle can't be null!";
#define PADDLE_DLSYM(neuron_adapter_func) \
do { \
neuron_adapter_func##_ = \
(neuron_adapter_func##_Type)dlsym(handle_, #neuron_adapter_func); \
if (neuron_adapter_func##_ == nullptr) { \
LOG(FATAL) << "Cannot find the " << #neuron_adapter_func \
<< " symbol in libneuron_adapter.so!"; \
break; \
} \
VLOG(4) << "Loaded the " << #neuron_adapter_func \
<< " symbol successfully."; \
} while (false)
PADDLE_DLSYM(Neuron_getVersion);
PADDLE_DLSYM(NeuronModel_create);
PADDLE_DLSYM(NeuronModel_free);
PADDLE_DLSYM(NeuronModel_finish);
PADDLE_DLSYM(NeuronModel_addOperand);
PADDLE_DLSYM(NeuronModel_setOperandValue);
PADDLE_DLSYM(NeuronModel_setOperandSymmPerChannelQuantParams);
PADDLE_DLSYM(NeuronModel_addOperation);
PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs);
PADDLE_DLSYM(NeuronCompilation_create);
PADDLE_DLSYM(NeuronCompilation_free);
PADDLE_DLSYM(NeuronCompilation_finish);
PADDLE_DLSYM(NeuronExecution_create);
PADDLE_DLSYM(NeuronExecution_free);
PADDLE_DLSYM(NeuronExecution_setInput);
PADDLE_DLSYM(NeuronExecution_setOutput);
PADDLE_DLSYM(NeuronExecution_compute);
#undef PADDLE_DLSYM
}
} // namespace lite
} // namespace paddle
int Neuron_getVersion(uint32_t* version) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getVersion()(version);
}
int NeuronModel_create(NeuronModel** model) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_create()(model);
}
void NeuronModel_free(NeuronModel* model) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_free()(model);
}
int NeuronModel_finish(NeuronModel* model) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_finish()(model);
}
int NeuronModel_addOperand(NeuronModel* model, const NeuronOperandType* type) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_addOperand()(model,
type);
}
int NeuronModel_setOperandValue(NeuronModel* model,
int32_t index,
const void* buffer,
size_t length) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_setOperandValue()(
model, index, buffer, length);
}
int NeuronModel_setOperandSymmPerChannelQuantParams(
NeuronModel* model,
int32_t index,
const NeuronSymmPerChannelQuantParams* channelQuant) {
return paddle::lite::NeuronAdapter::Global()
->NeuronModel_setOperandSymmPerChannelQuantParams()(
model, index, channelQuant);
}
int NeuronModel_addOperation(NeuronModel* model,
NeuronOperationType type,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_addOperation()(
model, type, inputCount, inputs, outputCount, outputs);
}
int NeuronModel_identifyInputsAndOutputs(NeuronModel* model,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs) {
return paddle::lite::NeuronAdapter::Global()
->NeuronModel_identifyInputsAndOutputs()(
model, inputCount, inputs, outputCount, outputs);
}
int NeuronCompilation_create(NeuronModel* model,
NeuronCompilation** compilation) {
return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_create()(
model, compilation);
}
void NeuronCompilation_free(NeuronCompilation* compilation) {
return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_free()(
compilation);
}
int NeuronCompilation_finish(NeuronCompilation* compilation) {
return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_finish()(
compilation);
}
int NeuronExecution_create(NeuronCompilation* compilation,
NeuronExecution** execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_create()(
compilation, execution);
}
void NeuronExecution_free(NeuronExecution* execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_free()(
execution);
}
int NeuronExecution_setInput(NeuronExecution* execution,
int32_t index,
const NeuronOperandType* type,
const void* buffer,
size_t length) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_setInput()(
execution, index, type, buffer, length);
}
int NeuronExecution_setOutput(NeuronExecution* execution,
int32_t index,
const NeuronOperandType* type,
void* buffer,
size_t length) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_setOutput()(
execution, index, type, buffer, length);
}
int NeuronExecution_compute(NeuronExecution* execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_compute()(
execution);
}
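A hedged usage sketch (not from the patch) of the adapter above: callers use the plain C entry points, and each wrapper resolves its symbol from libneuron_adapter.so on first use through NeuronAdapter::Global(). The success code is assumed to be NEURON_NO_ERROR, as in the Neuron headers.
#include <cstdint>
#include <cstdio>
void probe_neuron_version() {
  uint32_t version = 0;
  int err = Neuron_getVersion(&version);
  if (err == NEURON_NO_ERROR) {  // assumed success code from NeuronAdapter.h
    std::printf("Neuron runtime version: %u\n", version);
  } else {
    std::fprintf(stderr, "Neuron_getVersion failed with %d\n", err);
  }
}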
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "NeuronAdapter.h" // NOLINT
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
class NeuronAdapter final {
public:
static NeuronAdapter *Global();
// Platform APIs
using Neuron_getVersion_Type = int (*)(uint32_t *);
using NeuronModel_create_Type = int (*)(NeuronModel **);
using NeuronModel_free_Type = void (*)(NeuronModel *);
using NeuronModel_finish_Type = int (*)(NeuronModel *);
using NeuronModel_addOperand_Type = int (*)(NeuronModel *,
const NeuronOperandType *);
using NeuronModel_setOperandValue_Type = int (*)(NeuronModel *,
int32_t,
const void *,
size_t);
using NeuronModel_setOperandSymmPerChannelQuantParams_Type =
int (*)(NeuronModel *, int32_t, const NeuronSymmPerChannelQuantParams *);
using NeuronModel_addOperation_Type = int (*)(NeuronModel *,
NeuronOperationType,
uint32_t,
const uint32_t *,
uint32_t,
const uint32_t *);
using NeuronModel_identifyInputsAndOutputs_Type = int (*)(
NeuronModel *, uint32_t, const uint32_t *, uint32_t, const uint32_t *);
using NeuronCompilation_create_Type = int (*)(NeuronModel *,
NeuronCompilation **);
using NeuronCompilation_free_Type = void (*)(NeuronCompilation *);
using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *);
using NeuronExecution_create_Type = int (*)(NeuronCompilation *,
NeuronExecution **);
using NeuronExecution_free_Type = void (*)(NeuronExecution *);
using NeuronExecution_setInput_Type = int (*)(NeuronExecution *,
int32_t,
const NeuronOperandType *,
const void *,
size_t);
using NeuronExecution_setOutput_Type = int (*)(
NeuronExecution *, int32_t, const NeuronOperandType *, void *, size_t);
using NeuronExecution_compute_Type = int (*)(NeuronExecution *);
Neuron_getVersion_Type Neuron_getVersion() {
CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!";
return Neuron_getVersion_;
}
NeuronModel_create_Type NeuronModel_create() {
CHECK(NeuronModel_create_ != nullptr) << "Cannot load NeuronModel_create!";
return NeuronModel_create_;
}
NeuronModel_free_Type NeuronModel_free() {
CHECK(NeuronModel_free_ != nullptr) << "Cannot load NeuronModel_free!";
return NeuronModel_free_;
}
NeuronModel_finish_Type NeuronModel_finish() {
CHECK(NeuronModel_finish_ != nullptr) << "Cannot load NeuronModel_finish!";
return NeuronModel_finish_;
}
NeuronModel_addOperand_Type NeuronModel_addOperand() {
CHECK(NeuronModel_addOperand_ != nullptr)
<< "Cannot load NeuronModel_addOperand!";
return NeuronModel_addOperand_;
}
NeuronModel_setOperandValue_Type NeuronModel_setOperandValue() {
CHECK(NeuronModel_setOperandValue_ != nullptr)
<< "Cannot load NeuronModel_setOperandValue!";
return NeuronModel_setOperandValue_;
}
NeuronModel_setOperandSymmPerChannelQuantParams_Type
NeuronModel_setOperandSymmPerChannelQuantParams() {
CHECK(NeuronModel_setOperandSymmPerChannelQuantParams_ != nullptr)
<< "Cannot load NeuronModel_setOperandSymmPerChannelQuantParams!";
return NeuronModel_setOperandSymmPerChannelQuantParams_;
}
NeuronModel_addOperation_Type NeuronModel_addOperation() {
CHECK(NeuronModel_addOperation_ != nullptr)
<< "Cannot load NeuronModel_addOperation!";
return NeuronModel_addOperation_;
}
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs() {
CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr)
<< "Cannot load NeuronModel_identifyInputsAndOutputs!";
return NeuronModel_identifyInputsAndOutputs_;
}
NeuronCompilation_create_Type NeuronCompilation_create() {
CHECK(NeuronCompilation_create_ != nullptr)
<< "Cannot load NeuronCompilation_create!";
return NeuronCompilation_create_;
}
NeuronCompilation_free_Type NeuronCompilation_free() {
CHECK(NeuronCompilation_free_ != nullptr)
<< "Cannot load NeuronCompilation_free!";
return NeuronCompilation_free_;
}
NeuronCompilation_finish_Type NeuronCompilation_finish() {
CHECK(NeuronCompilation_finish_ != nullptr)
<< "Cannot load NeuronCompilation_finish!";
return NeuronCompilation_finish_;
}
NeuronExecution_create_Type NeuronExecution_create() {
CHECK(NeuronExecution_create_ != nullptr)
<< "Cannot load NeuronExecution_create!";
return NeuronExecution_create_;
}
NeuronExecution_free_Type NeuronExecution_free() {
CHECK(NeuronExecution_free_ != nullptr)
<< "Cannot load NeuronExecution_free!";
return NeuronExecution_free_;
}
NeuronExecution_setInput_Type NeuronExecution_setInput() {
CHECK(NeuronExecution_setInput_ != nullptr)
<< "Cannot loadcl NeuronExecution_setInput!";
return NeuronExecution_setInput_;
}
NeuronExecution_setOutput_Type NeuronExecution_setOutput() {
CHECK(NeuronExecution_setOutput_ != nullptr)
<< "Cannot load NeuronExecution_setOutput!";
return NeuronExecution_setOutput_;
}
NeuronExecution_compute_Type NeuronExecution_compute() {
CHECK(NeuronExecution_compute_ != nullptr)
<< "Cannot load NeuronExecution_compute!";
return NeuronExecution_compute_;
}
private:
NeuronAdapter();
NeuronAdapter(const NeuronAdapter &) = delete;
NeuronAdapter &operator=(const NeuronAdapter &) = delete;
bool InitHandle();
void InitFunctions();
void *handle_{nullptr};
Neuron_getVersion_Type Neuron_getVersion_{nullptr};
NeuronModel_create_Type NeuronModel_create_{nullptr};
NeuronModel_free_Type NeuronModel_free_{nullptr};
NeuronModel_finish_Type NeuronModel_finish_{nullptr};
NeuronModel_addOperand_Type NeuronModel_addOperand_{nullptr};
NeuronModel_setOperandValue_Type NeuronModel_setOperandValue_{nullptr};
NeuronModel_setOperandSymmPerChannelQuantParams_Type
NeuronModel_setOperandSymmPerChannelQuantParams_{nullptr};
NeuronModel_addOperation_Type NeuronModel_addOperation_{nullptr};
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs_{nullptr};
NeuronCompilation_create_Type NeuronCompilation_create_{nullptr};
NeuronCompilation_free_Type NeuronCompilation_free_{nullptr};
NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr};
NeuronExecution_create_Type NeuronExecution_create_{nullptr};
NeuronExecution_free_Type NeuronExecution_free_{nullptr};
NeuronExecution_setInput_Type NeuronExecution_setInput_{nullptr};
NeuronExecution_setOutput_Type NeuronExecution_setOutput_{nullptr};
NeuronExecution_compute_Type NeuronExecution_compute_{nullptr};
};
} // namespace lite
} // namespace paddle
......@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
const int pad_h0 = (*param.paddings)[0];
const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
......@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input,
int tile_h = (hout + 5) / 6;
int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2;
int h_pad = hin + pad_h * 2;
int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT
......@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride,
i * 4,
-pad_h,
hin + pad_h,
-pad_w,
win + pad_w,
-pad_h0,
hin + pad_h1,
-pad_w0,
win + pad_w1,
chin,
win,
hin,
......@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
const int pad_h0 = (*param.paddings)[0];
const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
......@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input,
int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2;
int h_pad = hin + pad_h * 2;
int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT
......@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride,
i * 4,
-pad_h,
hin + pad_h,
-pad_w,
win + pad_w,
-pad_h0,
hin + pad_h1,
-pad_w0,
win + pad_w1,
chin,
win,
hin,
......@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
const int pad_h0 = (*param.paddings)[0];
const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
......@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input,
int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2;
int h_pad = hin + pad_h * 2;
int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT
......@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride,
i * 4,
-pad_h,
hin + pad_h,
-pad_w,
win + pad_w,
-pad_h0,
hin + pad_h1,
-pad_w0,
win + pad_w1,
chin,
win,
hin,
......
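The three hunks above switch the winograd kernels from symmetric to asymmetric padding. A small sketch of the arithmetic involved (illustrative only, the struct and names are mine): the padded extent is the input size plus the leading and trailing pads, so a 3x3 input with pads {top=1, bottom=0, left=2, right=1} becomes 4x6.
struct PaddedExtent {
  int h_pad;
  int w_pad;
};
// Mirrors the w_pad / h_pad computation in the winograd kernels above.
PaddedExtent ComputePaddedExtent(int hin, int win,
                                 int pad_h0, int pad_h1,
                                 int pad_w0, int pad_w1) {
  return {hin + pad_h0 + pad_h1, win + pad_w0 + pad_w1};
}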
......@@ -21,6 +21,17 @@ namespace paddle {
namespace lite {
namespace arm {
namespace math {
int AdaptStartIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
floor(static_cast<double>(ph * input_size) / output_size));
}
int AdaptEndIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
ceil(static_cast<double>((ph + 1) * input_size) / output_size));
}
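// A worked example of the adaptive-index helpers above (my own check, not
// part of the patch): for input size 7 and adaptive output size 3 the windows
// are [0,3), [2,5) and [4,7), i.e. they cover the whole axis with slight
// overlap.
#include <cassert>  // assumed available; only needed for this standalone check
void adaptive_index_example() {
  assert(AdaptStartIndex(0, 7, 3) == 0 && AdaptEndIndex(0, 7, 3) == 3);
  assert(AdaptStartIndex(1, 7, 3) == 2 && AdaptEndIndex(1, 7, 3) == 5);
  assert(AdaptStartIndex(2, 7, 3) == 4 && AdaptEndIndex(2, 7, 3) == 7);
}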
void pooling_basic(const float* din,
float* dout,
int num,
......@@ -88,15 +99,27 @@ void pooling_basic(const float* din,
#pragma omp parallel for
for (int ind_c = 0; ind_c < chin; ++ind_c) {
for (int ind_h = 0; ind_h < hout; ++ind_h) {
int sh = ind_h * stride_h;
int eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > hin ? hin : eh - pad_h;
int sh, eh;
if (adaptive) {
sh = AdaptStartIndex(ind_h, hin, hout);
eh = AdaptEndIndex(ind_h, hin, hout);
} else {
sh = ind_h * stride_h;
eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > hin ? hin : eh - pad_h;
}
for (int ind_w = 0; ind_w < wout; ++ind_w) {
int sw = ind_w * stride_w;
int ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > win ? win : ew - pad_w;
int sw, ew;
if (adaptive) {
sw = AdaptStartIndex(ind_w, win, wout);
ew = AdaptEndIndex(ind_w, win, wout);
} else {
sw = ind_w * stride_w;
ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > win ? win : ew - pad_w;
}
float result = static_cast<float>(0);
int dst_ind = (ind_n * chout + ind_c) * size_channel_out +
ind_h * wout + ind_w;
......
This diff has been collapsed.
......@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) {
template <typename T>
void scale(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu6(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T>
void scale_leaky_relu(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T>
void scale(const T* din,
T* dout,
......
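Reference semantics for the new fused variants declared above (a plain scalar sketch, not the optimized ARM kernels): scale computes scale * x + bias, and each fused form simply applies the activation to that result.
float scale_relu_ref(float x, float scale, float bias) {
  float y = scale * x + bias;
  return y > 0.f ? y : 0.f;
}
float scale_relu6_ref(float x, float scale, float bias, float alpha) {
  float y = scale * x + bias;
  y = y > 0.f ? y : 0.f;
  return y < alpha ? y : alpha;  // alpha assumed to be the clipping threshold (typically 6)
}
float scale_leaky_relu_ref(float x, float scale, float bias, float alpha) {
  float y = scale * x + bias;
  return y > 0.f ? y : alpha * y;  // alpha assumed to be the negative slope
}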
......@@ -24,16 +24,17 @@ std::map<int, void*> TargetWrapperBM::bm_hds_;
size_t TargetWrapperBM::num_devices() {
int count = 0;
bm_dev_getcount(&count);
bm_status_t ret = bm_dev_getcount(&count);
CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: "
<< static_cast<int>(ret);
return count;
}
int TargetWrapperBM::GetDevice() { return device_id_; }
void TargetWrapperBM::SetDevice(int id) {
/*
if (id < 0 || (size_t)id >= num_devices()) {
LOG(FATAL) << "Failed with invalid device id " << id;
}
*/
if (id < 0 || (size_t)id >= num_devices()) {
LOG(FATAL) << "Failed with invalid device id " << id;
}
device_id_ = id;
if (bm_hds_.find(id) == bm_hds_.end()) {
bm_handle_t bm_handle;
......
......@@ -31,6 +31,7 @@ class TargetWrapper<TARGET(kBM)> {
static size_t maximum_stream() { return 0; }
static void SetDevice(int id);
static int GetDevice();
static void CreateStream(stream_t* stream) {}
static void DestroyStream(const stream_t& stream) {}
......
......@@ -129,6 +129,26 @@ bool CLRuntime::InitializePlatform() {
return true;
}
GpuType CLRuntime::ParseGpuTypeFromDeviceName(std::string device_name) {
const std::string kMALI_PATTERN_STR = "Mali";
const std::string kADRENO_PATTERN_STR = "QUALCOMM Adreno(TM)";
const std::string kPOWERVR_PATTERN_STR = "PowerVR";
if (device_name == kADRENO_PATTERN_STR) {
LOG(INFO) << "adreno gpu";
return GpuType::QUALCOMM_ADRENO;
} else if (device_name.find(kMALI_PATTERN_STR) != std::string::npos) {
LOG(INFO) << "mali gpu";
return GpuType::ARM_MALI;
} else if (device_name.find(kPOWERVR_PATTERN_STR) != std::string::npos) {
LOG(INFO) << "powerVR gpu";
return GpuType::IMAGINATION_POWERVR;
} else {
LOG(INFO) << "others gpu";
return GpuType::UNKNOWN;
}
}
bool CLRuntime::InitializeDevice() {
// ===================== BASIC =====================
// CL_DEVICE_TYPE_GPU
......@@ -148,6 +168,7 @@ bool CLRuntime::InitializeDevice() {
auto device_name = device_->getInfo<CL_DEVICE_NAME>();
LOG(INFO) << "Using device: " << device_name;
gpu_type_ = ParseGpuTypeFromDeviceName(device_name);
cl_device_type device_type = device_->getInfo<CL_DEVICE_TYPE>();
auto device_type_to_str = [](cl_device_type t) -> std::string {
......@@ -296,5 +317,53 @@ std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() {
return device_info_;
}
void CLRuntime::GetAdrenoContextProperties(
std::vector<cl_context_properties>* properties,
GPUPerfMode gpu_perf_mode,
GPUPriorityLevel gpu_priority_level) {
CHECK(properties) << "cl_context_properties is nullptr";
properties->reserve(5);
switch (gpu_perf_mode) {
case GPUPerfMode::PERF_LOW:
LOG(INFO) << "GPUPerfMode::PERF_LOW";
properties->push_back(CL_CONTEXT_PERF_MODE_QCOM);
properties->push_back(CL_PERF_MODE_LOW_QCOM);
break;
case GPUPerfMode::PERF_NORMAL:
LOG(INFO) << "GPUPerfMode::PERF_NORMAL";
properties->push_back(CL_CONTEXT_PERF_MODE_QCOM);
properties->push_back(CL_PERF_MODE_NORMAL_QCOM);
break;
case GPUPerfMode::PERF_HIGH:
LOG(INFO) << "GPUPerfMode::PERF_HIGH";
properties->push_back(CL_CONTEXT_PERF_MODE_QCOM);
properties->push_back(CL_PERF_MODE_HIGH_QCOM);
break;
default:
break;
}
switch (gpu_priority_level) {
case GPUPriorityLevel::PRIORITY_LOW:
LOG(INFO) << "GPUPriorityLevel::PRIORITY_LOW";
properties->push_back(CL_CONTEXT_PRIORITY_LEVEL_QCOM);
properties->push_back(CL_PRIORITY_HINT_LOW_QCOM);
break;
case GPUPriorityLevel::PRIORITY_NORMAL:
LOG(INFO) << "GPUPriorityLevel::PRIORITY_NORMAL";
properties->push_back(CL_CONTEXT_PRIORITY_LEVEL_QCOM);
properties->push_back(CL_PRIORITY_HINT_NORMAL_QCOM);
break;
case GPUPriorityLevel::PRIORITY_HIGH:
LOG(INFO) << "GPUPriorityLevel::PRIORITY_HIGH";
properties->push_back(CL_CONTEXT_PRIORITY_LEVEL_QCOM);
properties->push_back(CL_PRIORITY_HINT_HIGH_QCOM);
break;
default:
break;
}
// The properties list should be terminated with 0
properties->push_back(0);
}
} // namespace lite
} // namespace paddle
......@@ -19,6 +19,45 @@ limitations under the License. */
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h"
typedef enum {
UNKNOWN = 0,
QUALCOMM_ADRENO = 1,
ARM_MALI = 2,
IMAGINATION_POWERVR = 3,
OTHERS = 4,
} GpuType;
typedef enum {
PERF_DEFAULT = 0,
PERF_LOW = 1,
PERF_NORMAL = 2,
PERF_HIGH = 3
} GPUPerfMode;
typedef enum {
PRIORITY_DEFAULT = 0,
PRIORITY_LOW = 1,
PRIORITY_NORMAL = 2,
PRIORITY_HIGH = 3
} GPUPriorityLevel;
// Adreno extensions
// Adreno performance hints
typedef cl_uint cl_perf_hint;
#define CL_CONTEXT_PERF_MODE_QCOM 0x40C2
#define CL_PERF_MODE_HIGH_QCOM 0x40C3
#define CL_PERF_MODE_NORMAL_QCOM 0x40C4
#define CL_PERF_MODE_LOW_QCOM 0x40C5
// Adreno priority hints
typedef cl_uint cl_priority_hint;
#define CL_PRIORITY_HINT_NONE_QCOM 0
#define CL_CONTEXT_PRIORITY_LEVEL_QCOM 0x40C9
#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA
#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB
#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC
namespace paddle {
namespace lite {
......@@ -63,9 +102,28 @@ class CLRuntime {
bool InitializeDevice();
void GetAdrenoContextProperties(
std::vector<cl_context_properties>* properties,
GPUPerfMode gpu_perf_mode,
GPUPriorityLevel gpu_priority_level);
std::shared_ptr<cl::Context> CreateContext() {
auto context = std::make_shared<cl::Context>(
std::vector<cl::Device>{device()}, nullptr, nullptr, nullptr, &status_);
// note(ysh329): the GPU perf mode and priority level for Adreno GPUs are
// referred from xiaomi/mace. However, no performance gain was observed
// after setting `PERF_HIGH` and `PRIORITY_HIGH`.
auto perf_mode = GPUPerfMode::PERF_HIGH;
auto priority_level = GPUPriorityLevel::PRIORITY_HIGH;
std::vector<cl_context_properties> context_properties;
if (gpu_type_ == GpuType::QUALCOMM_ADRENO) {
GetAdrenoContextProperties(
&context_properties, perf_mode, priority_level);
}
auto context =
std::make_shared<cl::Context>(std::vector<cl::Device>{device()},
context_properties.data(),
nullptr,
nullptr,
&status_);
CL_CHECK_FATAL(status_);
return context;
}
......@@ -83,8 +141,12 @@ class CLRuntime {
return queue;
}
GpuType ParseGpuTypeFromDeviceName(std::string device_name);
std::map<std::string, size_t> device_info_;
GpuType gpu_type_{GpuType::UNKNOWN};
std::string cl_path_;
std::shared_ptr<cl::Platform> platform_{nullptr};
......
......@@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error);
__FILE__, \
__LINE__); \
}
#ifndef LITE_SHUTDOWN_LOG
#ifdef LITE_WITH_LOG
#define CL_CHECK_FATAL(err_code__) \
if (err_code__ != CL_SUCCESS) { \
LOG(FATAL) << string_format( \
......
......@@ -129,8 +129,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
T* output_data = output->template mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) {
output_data[i * in_dims[0] + j] =
input_data[i * in_dims[0] + j] + vector_data[j];
output_data[i * size + j] = input_data[i * size + j] + vector_data[j];
}
}
}
......
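A minimal sketch of the row-major indexing the fix above restores (the container types and names are mine, not the library's): for an input with `size` columns, element (i, j) lives at offset i * size + j; using in_dims[0] (the row count) as the stride only happens to work for square inputs.
#include <vector>
std::vector<float> rowwise_add_ref(const std::vector<float>& input,
                                   const std::vector<float>& vec,
                                   int rows, int size) {
  std::vector<float> output(rows * size);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < size; ++j) {
      output[i * size + j] = input[i * size + j] + vec[j];  // row-major offset
    }
  }
  return output;
}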
......@@ -279,7 +279,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
}
}
if (has_value_input == nullptr) {
VLOG(3) << "no input has value! just return" << std::endl;
VLOG(3) << "no input has value! just return";
return;
}
auto input_width = has_value_input->value().dims()[1];
......
......@@ -19,6 +19,7 @@ namespace lite {
#ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
int Context<TargetType::kXPU>::_workspace_l3_size_per_thread{0};
#endif
} // namespace lite
......
......@@ -110,9 +110,7 @@ class Context<TargetType::kBM> {
Context() {}
explicit Context(const BMContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() { Init(0); }
void Init(int dev_id) { TargetWrapperBM::SetDevice(dev_id); }
void InitOnce() { TargetWrapperBM::SetDevice(TargetWrapperBM::GetDevice()); }
void CopySharedTo(BMContext* ctx) {}
void* GetHandle() { return TargetWrapperBM::GetHandle(); }
......@@ -151,14 +149,23 @@ class Context<TargetType::kXPU> {
if (_tls_raw_ctx == nullptr) {
_tls_raw_ctx = xdnn::create_context();
CHECK(_tls_raw_ctx);
int r = xdnn::set_workspace_l3_size(_tls_raw_ctx,
_workspace_l3_size_per_thread);
if (r != 0) {
LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r
<< ", _workspace_l3_size_per_thread = "
<< _workspace_l3_size_per_thread;
}
}
return _tls_raw_ctx;
}
static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
xdnn::set_workspace_l3_size(GetRawContext(), l3_size);
_workspace_l3_size_per_thread = l3_size;
}
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) {
......@@ -173,6 +180,7 @@ class Context<TargetType::kXPU> {
private:
static thread_local xdnn::Context* _tls_raw_ctx;
static int _workspace_l3_size_per_thread;
};
#endif
......
......@@ -1240,6 +1240,19 @@ void Device<TARGET(kMLU)>::CreateQueue() {
}
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_BM
void Device<TARGET(kBM)>::SetId(int device_id) {
LOG(INFO) << "Set bm device " << device_id;
TargetWrapper<TARGET(kBM)>::SetDevice(device_id);
idx_ = device_id;
}
void Device<TARGET(kBM)>::Init() { SetId(idx_); }
int Device<TARGET(kBM)>::core_num() {
return TargetWrapper<TARGET(kBM)>::num_devices();
}
#endif // LITE_WITH_BM
#ifdef LITE_WITH_CUDA
void Device<TARGET(kCUDA)>::Init() {
......
......@@ -221,6 +221,49 @@ class Device<TARGET(kMLU)> {
template class Env<TARGET(kMLU)>;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_BM
template <>
class Device<TARGET(kBM)> {
public:
Device(int dev_id, int max_stream = 1)
: idx_(dev_id), max_stream_(max_stream) {}
void Init();
int id() { return idx_; }
int max_stream() { return 1; }
std::string name() { return "BM"; }
float max_memory() { return 16; }
int core_num();
void SetId(int idx);
int sm_version() { return 0; }
bool has_fp16() { return false; }
bool has_int8() { return false; }
bool has_hmma() { return false; }
bool has_imma() { return false; }
int runtime_version() { return 0; }
private:
void CreateQueue() {}
void GetInfo() {}
private:
int idx_{0};
int max_stream_{1};
std::string device_name_;
float max_memory_;
int sm_version_;
bool has_fp16_;
bool has_int8_;
bool has_hmma_;
bool has_imma_;
int runtime_version_;
};
template class Env<TARGET(kBM)>;
#endif
#ifdef LITE_WITH_CUDA
template <>
class Device<TARGET(kCUDA)> {
......
......@@ -21,9 +21,13 @@ lite_cc_library(mir_passes
fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc
fusion/scale_activation_fuse_pass.cc
fusion/__xpu__resnet_fuse_pass.cc
fusion/__xpu__multi_encoder_fuse_pass.cc
fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
fusion/__xpu__fc_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace {
class Eliminator : public FuseBase {
public:
void BuildPattern() override {
// the previous op's output needs updating
auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block");
// TODO(Superjomn) check has only one output
auto* x = VarNode("x")->assert_is_op_input("dropout", "X");
auto* dropout_op = OpNode("dropout", "dropout")
->assert_op_attr<int>("is_test", 1)
->assert_op_attr<std::string>(
"dropout_implementation", "upscale_in_train");
auto* out = VarNode("out")->assert_is_op_output("dropout", "Out");
auto* mask = VarNode("mask")->assert_is_op_output("dropout", "Mask");
*pre_op >> *x >> *dropout_op >> *out;
*dropout_op >> *mask;
// The pre_op will be eliminated, and a new output-updated op will be inserted.
x->AsIntermediate(); // x is pre_op's output, need to update
dropout_op->AsIntermediate();
mask->AsIntermediate();
}
private:
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
auto& pre_op = matched.at("preop")->AsStmt();
auto op_info = *pre_op.op_info();
op_info.UpdateAllOutputs(matched.at("x")->AsArg().name,
matched.at("out")->AsArg().name);
pre_op.ResetOp(op_info, graph->valid_places());
IR_NODE_LINK_TO(matched.at("preop"), matched.at("out"));
}
};
} // namespace
class IdentityDropoutEliminatePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
Eliminator eliminator;
eliminator(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(identity_dropout_eliminate_pass,
paddle::lite::mir::IdentityDropoutEliminatePass)
.BindTargets({TARGET(kXPU)});
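Reference semantics motivating the pass above (a scalar sketch under the usual Paddle dropout definitions, not library code): at inference time the "upscale_in_train" implementation leaves its input untouched, so the op is an identity and can be folded away, whereas "downgrade_in_infer" scales by (1 - dropout_prob) and must be kept.
float dropout_infer_ref(float x, float dropout_prob, bool upscale_in_train) {
  return upscale_in_train ? x : x * (1.0f - dropout_prob);
}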
......@@ -26,7 +26,9 @@ class Eliminator : public FuseBase {
public:
void BuildPattern() override {
// the previous op's output needs updating
auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block");
auto* pre_op = OpNode("preop")
->assert_is_not_op_type("conditional_block")
->assert_is_not_op_type("scale");
// TODO(Superjomn) check has only one output
auto* x = VarNode("x")->assert_is_op_input("scale", "X");
auto* scale_op = OpNode("scale", "scale")
......
......@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate
lite_cc_library(fuse_sequence_pool_concat
SRCS sequence_pool_concat_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_scale_activation
SRCS scale_activation_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers
fuse_fc
......@@ -44,6 +47,7 @@ set(mir_fusers
fuse_transpose_softmax_transpose
fuse_interpolate
fuse_sequence_pool_concat
fuse_scale_activation
CACHE INTERNAL "fusers")
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUEmbeddingWithEltwiseAddFuser : public FuseBase {
public:
explicit XPUEmbeddingWithEltwiseAddFuser(int n_embedding)
: n_embedding_(n_embedding) {}
void BuildPattern() override {
auto* ids0 =
VarNode("ids0")->assert_is_op_input("lookup_table", "Ids")->AsInput();
auto* table0 =
VarNode("table0")->assert_is_op_input("lookup_table", "W")->AsInput();
auto* embedding0 = OpNode("embedding0", "lookup_table");
auto* embedding_out0 = VarNode("embedding_out0")
->assert_is_op_output("lookup_table", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* ids1 =
VarNode("ids1")->assert_is_op_input("lookup_table", "Ids")->AsInput();
auto* table1 =
VarNode("table1")->assert_is_op_input("lookup_table", "W")->AsInput();
auto* embedding1 = OpNode("embedding1", "lookup_table")->AsIntermediate();
auto* embedding_out1 = VarNode("embedding_out1")
->assert_is_op_output("lookup_table", "Out")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto* ewadd01 = OpNode("ewadd01", "elementwise_add")->AsIntermediate();
auto* ewadd01_out = VarNode("ewadd01_out")
->assert_is_op_output("elementwise_add", "Out")
->AsIntermediate();
embedding0->LinksFrom({ids0, table0});
embedding0->LinksTo({embedding_out0});
embedding1->LinksFrom({ids1, table1});
embedding1->LinksTo({embedding_out1});
ewadd01->LinksFrom({embedding_out0, embedding_out1});
ewadd01->LinksTo({ewadd01_out});
auto* last_ewadd_out = ewadd01_out;
for (int i = 2; i < n_embedding_; ++i) {
auto ids_name = paddle::lite::string_format("ids%d", i);
auto table_name = paddle::lite::string_format("table%d", i);
auto embedding_name = paddle::lite::string_format("embedding%d", i);
auto embedding_out_name =
paddle::lite::string_format("embedding_out%d", i);
auto* new_ids = VarNode(ids_name)
->assert_is_op_input("lookup_table", "Ids")
->AsInput();
auto* new_table = VarNode(table_name)
->assert_is_op_input("lookup_table", "W")
->AsInput();
auto* new_embedding =
OpNode(embedding_name, "lookup_table")->AsIntermediate();
auto* new_embedding_out = VarNode(embedding_out_name)
->assert_is_op_output("lookup_table", "Out")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
new_embedding->LinksFrom({new_ids, new_table});
new_embedding->LinksTo({new_embedding_out});
auto ewadd_name = paddle::lite::string_format("ewadd%d%d", i - 1, i);
auto ewadd_out_name = ewadd_name + "_out";
auto* new_ewadd = OpNode(ewadd_name, "elementwise_add")->AsIntermediate();
auto* new_ewadd_out = VarNode(ewadd_out_name)
->assert_is_op_output("elementwise_add", "Out")
->AsIntermediate();
new_ewadd->LinksFrom({last_ewadd_out, new_embedding_out});
new_ewadd->LinksTo({new_ewadd_out});
last_ewadd_out = new_ewadd_out;
}
last_ewadd_out->AsOutput();
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("__xpu__embedding_with_eltwise_add");
std::vector<std::string> ids_names;
std::vector<std::string> table_names;
for (int i = 0; i < n_embedding_; ++i) {
auto ids_name = paddle::lite::string_format("ids%d", i);
ids_names.push_back(matched.at(ids_name)->arg()->name);
auto table_name = paddle::lite::string_format("table%d", i);
table_names.push_back(matched.at(table_name)->arg()->name);
}
op_desc.SetInput("Ids", ids_names);
op_desc.SetInput("Tables", table_names);
auto output_name = paddle::lite::string_format(
"ewadd%d%d_out", n_embedding_ - 2, n_embedding_ - 1);
op_desc.SetOutput("Output", {matched.at(output_name)->arg()->name});
op_desc.SetAttr<int>("n_embedding", n_embedding_);
auto* embedding0_op_info = matched.at("embedding0")->stmt()->op_info();
op_desc.SetAttr<int64_t>(
"padding_idx", embedding0_op_info->GetAttr<int64_t>("padding_idx"));
auto* new_stmt = matched.at("embedding0")->stmt();
auto new_op = LiteOpRegistry::Global().Create(op_desc.Type());
new_op->Attach(op_desc, new_stmt->op()->scope());
new_op->SetValidPlaces(new_stmt->op()->valid_places());
auto kernels = new_op->CreateKernels(new_op->valid_places());
new_stmt->SetOp(new_op);
new_stmt->SetKernels(std::move(kernels));
for (int i = 0; i < n_embedding_; ++i) {
auto ids_name = paddle::lite::string_format("ids%d", i);
auto table_name = paddle::lite::string_format("table%d", i);
DirectedLink(matched.at(ids_name), matched.at("embedding0"));
DirectedLink(matched.at(table_name), matched.at("embedding0"));
}
IR_OP_VAR_LINK(matched.at("embedding0"), matched.at(output_name));
}
private:
int n_embedding_;
};
} // namespace fusion
class XPUEmbeddingWithEltwiseAddFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
for (int n_embedding : {4, 3}) {
fusion::XPUEmbeddingWithEltwiseAddFuser fuser(n_embedding);
fuser(graph.get());
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass,
paddle::lite::mir::XPUEmbeddingWithEltwiseAddFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("lookup_table");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUFcFuser : public FuseBase {
public:
explicit XPUFcFuser(bool with_relu) : with_relu_(with_relu) {}
void BuildPattern() override {
// create nodes.
auto* x = VarNode("x")->assert_is_op_input("mul", "X");
auto* W = VarNode("W")->assert_is_op_input("mul", "Y");
auto* b = VarNode("b")->assert_is_persistable_var();
auto* mul = OpNode("mul", "mul");
auto* mul_out = VarNode("mul_out");
auto* add = OpNode("add", "elementwise_add");
auto* Out = VarNode("Out");
// create topology.
std::vector<PMNode*> mul_inputs{W, x};
std::vector<PMNode*> add_inputs{mul_out, b};
mul_inputs >> *mul >> *mul_out;
// Some op specialities.
mul_out->AsIntermediate();
mul->AsIntermediate();
add->AsIntermediate();
if (with_relu_) {
auto* add_out = VarNode("add_out");
auto* relu = OpNode("relu", "relu");
std::vector<PMNode*> relu_inputs{add_out};
add_inputs >> *add >> *add_out;
relu_inputs >> *relu >> *Out;
add_out->AsIntermediate();
relu->AsIntermediate();
} else {
add_inputs >> *add >> *Out;
}
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
auto mul = matched.at("mul")->stmt()->op();
auto* scope = mul->scope();
// convert W from float to int16, and transpose W
auto weight_name = matched.at("W")->arg()->name;
auto* weight_t = scope->FindMutableTensor(weight_name);
auto weight_dims = weight_t->dims();
int weight_len = weight_t->numel();
float* weight_on_host = weight_t->mutable_data<float>();
float max_f =
paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len);
std::unique_ptr<int16_t[]> weight_int16(new int16_t[weight_len]);
std::unique_ptr<int16_t[]> weight_trans_int16(new int16_t[weight_len]);
paddle::lite::xpu::math::ConvertFP32ToInt16(
weight_on_host, weight_int16.get(), max_f, weight_len);
paddle::lite::xpu::math::Transpose(weight_int16.get(),
weight_trans_int16.get(),
weight_dims[0],
weight_dims[1]);
memcpy(
weight_on_host, weight_trans_int16.get(), weight_len * sizeof(int16_t));
auto op_desc = GenOpDesc(matched, max_f, true);
auto fc_op = LiteOpRegistry::Global().Create("__xpu__fc");
auto& valid_places = mul->valid_places();
fc_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places);
IR_NODE_LINK_TO(matched.at("W"), new_op_node);
IR_NODE_LINK_TO(matched.at("x"), new_op_node);
IR_NODE_LINK_TO(matched.at("b"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("Out"));
}
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched,
float w_max,
bool transpose_w) {
cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info();
op_desc.mutable_inputs()->clear();
op_desc.mutable_outputs()->clear();
op_desc.SetType("__xpu__fc");
op_desc.SetInput("Input", {matched.at("x")->arg()->name});
op_desc.SetInput("W", {matched.at("W")->arg()->name});
op_desc.SetInput("Bias", {matched.at("b")->arg()->name});
op_desc.SetOutput("Out", {matched.at("Out")->arg()->name});
op_desc.SetAttr(
"in_num_col_dims",
matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
op_desc.SetAttr("w_max", w_max);
op_desc.SetAttr("transpose_w", transpose_w);
if (with_relu_) {
op_desc.SetAttr("activation_type", std::string{"relu"});
}
return op_desc;
}
bool with_relu_;
};
} // namespace fusion
class XPUFcFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
fusion::XPUFcFuser fuser(true /* with_relu */);
fuser(graph.get());
fusion::XPUFcFuser fuser2(false /* with_relu */);
fuser2(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__fc_fuse_pass, paddle::lite::mir::XPUFcFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("fc");
......@@ -16,6 +16,7 @@
#include <vector>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/type_precision_cast_pass.h" // For UpdateInputs()
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h"
......@@ -588,8 +589,7 @@ class XPUMultiEncoderFuser {
multi_encoder_stmt->SetOp(multi_encoder_op);
multi_encoder_stmt->SetKernels(std::move(kernels));
// temp remove useless cast
std::unordered_set<const Node*> to_remove2;
// remove dangling/useless cast
Node* stack = nullptr;
for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt());
......@@ -597,16 +597,39 @@ class XPUMultiEncoderFuser {
stack = node;
}
}
Node* stack_out = stack->outlinks.front();
for (Node* cast : stack_out->outlinks) {
Node* cast_out = cast->outlinks.front();
if (cast_out->outlinks.size() == 0) {
// remove
to_remove2.insert(cast_out);
to_remove2.insert(cast);
if (stack) {
std::unordered_set<const Node*> to_remove2;
Node* stack_out = stack->outlinks.front();
// avoid modification while traversing
auto stack_out_outlinks = stack_out->outlinks;
for (Node* cast : stack_out_outlinks) {
if (cast->stmt()->op_info()->Type() != "cast") {
continue;
}
Node* cast_out = cast->outlinks.front();
if (cast_out->outlinks.size() == 0) {
// dangling cast
to_remove2.insert(cast);
to_remove2.insert(cast_out);
VLOG(3) << "Remove dangling cast [" << cast_out->arg()->name << "]";
} else if (cast_out->outlinks.size() == 1) {
// useless cast
to_remove2.insert(cast);
to_remove2.insert(cast_out);
VLOG(3) << "Remove useless cast [" << cast_out->arg()->name << "]";
auto* multi_encoder = cast_out->outlinks.front();
DirectedLink(stack_out, multi_encoder);
UpdateInputs(multi_encoder->stmt()->op().get(),
cast_out->arg()->name,
stack_out->arg()->name);
auto update_op_info = *multi_encoder->stmt()->op_info();
multi_encoder->stmt()->ResetOp(update_op_info, graph->valid_places());
}
}
GraphSafeRemoveNodes(graph, to_remove2);
}
GraphSafeRemoveNodes(graph, to_remove2);
}
};
......
......@@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
std::string conv_weight_name = matched.at("conv_weight")->arg()->name;
auto conv_weight_t =
scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
auto groups = conv_op_desc->GetAttr<int>("groups");
bool depthwise = false;
if (conv_type_ == "conv2d_transpose") {
depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups);
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[1]))
static_cast<size_t>(conv_weight_t->dims()[1] * groups))
<< "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights";
} else {
......@@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
// compute new conv_weight for int8
auto weight_scale =
conv_op_desc->GetAttr<std::vector<float>>("weight_scale");
if (conv_type_ == "conv2d_transpose") {
if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
......@@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
} else {
// compute new conv_weight
auto conv_weight_d = conv_weight_t->mutable_data<float>();
if (conv_type_ == "conv2d_transpose") {
if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
......
......@@ -23,11 +23,15 @@ namespace lite {
namespace mir {
void InterpolateFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::InterpolateFuser bilinear_interp_fuser("bilinear_interp");
bilinear_interp_fuser(graph.get());
std::vector<std::string> Interpolate_type_cases{"bilinear_interp",
"nearest_interp"};
for (auto type_ : Interpolate_type_cases) {
fusion::InterpolateFuser interp_fuser(type_);
interp_fuser(graph.get());
fusion::InterpolateFuser nearest_interp_fuser("nearest_interp");
nearest_interp_fuser(graph.get());
fusion::InterpolateFuser2 interp_fuser2(type_);
interp_fuser2(graph.get());
}
}
} // namespace mir
......
......@@ -22,6 +22,9 @@ namespace mir {
namespace fusion {
void InterpolateFuser::BuildPattern() {
// type1 fill_constant -->
// x --> shape --> slice --> cast --> elementwise_mul --> interpolate
// `-------------------------------------------------->
auto* x = VarNode("x");
auto* shape = OpNode("shape", "shape")->AsIntermediate();
auto* shape_out = VarNode("shape_out")->AsIntermediate();
......@@ -89,6 +92,64 @@ cpp::OpDesc InterpolateFuser::GenOpDesc(const key2nodes_t& matched) {
return op_desc;
}
void InterpolateFuser2::BuildPattern() {
// type2 x --> shape --> slice --> cast --> scale --> interpolate
// `---------------------------------------->
auto* x = VarNode("x");
auto* shape = OpNode("shape", "shape")->AsIntermediate();
auto* shape_out = VarNode("shape_out")->AsIntermediate();
auto* slice = OpNode("slice", "slice")
->assert_op_attr_satisfied<std::vector<int>>(
"axes",
[](const std::vector<int>& attr) {
return attr.size() == 1 && attr[0] == 0;
})
->assert_op_attr_satisfied<std::vector<int>>(
"starts",
[](const std::vector<int>& attr) {
return attr.size() == 1 && attr[0] == 2;
})
->assert_op_attr_satisfied<std::vector<int>>(
"ends",
[](const std::vector<int>& attr) {
return attr.size() == 1 && attr[0] == 4;
})
->AsIntermediate();
auto* slice_out = VarNode("slice_out")->AsIntermediate();
auto* cast = OpNode("cast", "cast")->AsIntermediate();
auto* cast_out = VarNode("cast_out")->AsIntermediate();
auto* scale = OpNode("scale", "scale")->AsIntermediate();
auto* scale_out = VarNode("scale_out")->AsIntermediate();
auto* interpolate = OpNode("interpolate", interp_type_)->AsIntermediate();
auto* interpolate_out = VarNode("interpolate_out");
// create topology.
*x >> *shape >> *shape_out >> *slice >> *slice_out >> *cast >> *cast_out >>
*scale >> *scale_out >> *interpolate >> *interpolate_out;
*x >> *interpolate;
}
void InterpolateFuser2::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto interp_op = LiteOpRegistry::Global().Create(interp_type_);
auto interp_old = matched.at("interpolate")->stmt()->op();
auto* scope = interp_old->scope();
auto& valid_places = interp_old->valid_places();
interp_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(interp_op, valid_places);
IR_NODE_LINK_TO(matched.at("x"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("interpolate_out"));
}
cpp::OpDesc InterpolateFuser2::GenOpDesc(const key2nodes_t& matched) {
auto op_desc = *matched.at("interpolate")->stmt()->op_info();
op_desc.SetInput("OutSize", {});
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
......
......@@ -36,6 +36,19 @@ class InterpolateFuser : public FuseBase {
std::string interp_type_;
};
class InterpolateFuser2 : public FuseBase {
public:
explicit InterpolateFuser2(const std::string& interp_type)
: interp_type_(interp_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
std::string interp_type_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void ScaleActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
for (auto act_type : {"relu", "relu6", "leaky_relu"}) {
fusion::ScaleActivationFuser fuser(act_type);
fuser(graph.get());
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_scale_activation_fuse_pass,
paddle::lite::mir::ScaleActivationFusePass)
.BindTargets({TARGET(kARM)})
.BindKernel("scale");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class ScaleActivationFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void ScaleActivationFuser::BuildPattern() {
// create input nodes.
auto* x = VarNode("x")->assert_is_op_input("scale", "X")->AsInput();
// create op nodes
auto* scale =
OpNode("scale", "scale")->assert_is_op("scale")->AsIntermediate();
auto* act =
OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate();
// create intermediate nodes
auto* scale_out = VarNode("scale_out")
->assert_is_op_output("scale", "Out")
->assert_is_op_input(act_type_, "X")
->AsIntermediate();
// create output node
auto* out =
VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
// create topology.
*x >> *scale >> *scale_out;
*scale_out >> *act >> *out;
}
void ScaleActivationFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto scale_op = LiteOpRegistry::Global().Create("scale");
auto scale = matched.at("scale")->stmt()->op();
auto* scope = scale->scope();
auto& valid_places = scale->valid_places();
scale_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(scale_op, valid_places);
IR_NODE_LINK_TO(matched.at("x"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("output"));
}
cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc = *matched.at("scale")->stmt()->op_info();
op_desc.SetOutput("Out", {matched.at("output")->arg()->name});
cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info();
op_desc.SetAttr("activation_type", act_type_);
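  // Carry the activation's attributes over to the fused scale op: relu only
  // sets fuse_relu, relu6 passes its `threshold` as alpha, and leaky_relu
  // passes its own `alpha`.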
if (act_type_ == "relu") {
op_desc.SetAttr("fuse_relu", true);
} else if (act_type_ == "relu6") {
float alpha = act_op_desc.GetAttr<float>("threshold");
op_desc.SetAttr("alpha", alpha);
} else if (act_type_ == "leaky_relu") {
float alpha = act_op_desc.GetAttr<float>("alpha");
op_desc.SetAttr("alpha", alpha);
}
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class ScaleActivationFuser : public FuseBase {
public:
explicit ScaleActivationFuser(const std::string& act_type) {
act_type_ = act_type;
}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
std::string act_type_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -260,6 +260,9 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kAny),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
......
......@@ -71,12 +71,17 @@ class Optimizer {
"identity_scale_eliminate_pass", //
"elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM)
"lite_elementwise_add_activation_fuse_pass", //
#endif
"__xpu__resnet_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"identity_dropout_eliminate_pass", // should be placed after
// xpu fusion
"quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer
// the output scale and
......
......@@ -22,6 +22,7 @@
#include <string>
#include <vector>
#include "lite/core/program.h"
#include "lite/fluid/float16.h"
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_image_converter.h"
......@@ -52,6 +53,24 @@ static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
return true;
}
// Append the given summary string to the file at `log_dir`.
static bool write_precision_summary_tofile(const std::string& string,
                                           const std::string& log_dir = "") {
  if (log_dir == "") {
    LOG(INFO) << "The `log_dir` of the precision summary file is not set, "
                 "so the summary will not be written to a file.";
    return false;
  }
  FILE* fp = fopen(log_dir.c_str(), "a");
  if (fp == nullptr) {
    LOG(INFO) << "Failed to open precision summary file: " << log_dir;
    return false;
  } else {
    fprintf(fp, "%s\n", string.c_str());
  }
  fclose(fp);
  return true;
}
class PrecisionProfiler {
public:
// TODO(ysh329): need to remove `explicit PrecisionProfiler`
......@@ -67,7 +86,7 @@ class PrecisionProfiler {
using std::left;
using std::fixed;
STL::stringstream ss;
ss << "========================================= "
ss << "\n\n========================================= "
<< "Detailed Precision Profiler Summary "
<< "=========================================" << std::endl;
ss << setw(45) << left << "operator:(kernel_info)"
......@@ -77,6 +96,13 @@ class PrecisionProfiler {
<< " " << setw(15) << left << "std_deviation"
<< " " << setw(15) << left << "ave_grow_rate*" << std::endl;
    // Also write the header to the file at `log_dir_`.
    if (log_dir_ != "") {
      FILE* fp = fopen(log_dir_.c_str(), "a");
      if (fp != nullptr) {
        std::string header_str{ss.str()};
        fprintf(fp, "%s\n", header_str.c_str());
        fclose(fp);
      } else {
        LOG(INFO) << "Failed to open precision summary file: " << log_dir_;
      }
    }
return ss.str();
}
......@@ -194,6 +220,7 @@ class PrecisionProfiler {
}
#ifdef LITE_WITH_OPENCL
} else if (target_type == TARGET(kOpenCL)) {
CLRuntime::Global()->command_queue().finish();
switch (layout_type) {
case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor;
......@@ -360,8 +387,12 @@ class PrecisionProfiler {
}
}
}
write_precision_summary_tofile(ss.str(), log_dir_);
return ss.str();
}
private:
std::string log_dir_{"/storage/emulated/0/precision.log"};
};
} // namespace profile
......
......@@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const {
return nullptr;
}
// AttributeVarNames will get persistent attribute names stored in parent scopes
std::vector<std::string> Scope::AttributeVarNames() const {
std::vector<std::string> resulted_keys;
const Scope *cur_scope = this;
while (cur_scope->parent()) {
cur_scope = cur_scope->parent();
auto keys = cur_scope->LocalVarNames();
resulted_keys.insert(resulted_keys.end(), keys.begin(), keys.end());
}
// remove feed and fetch
  std::vector<std::string> skipped_vars = {"feed", "fetch"};
  for (size_t i = 0; i < skipped_vars.size(); i++) {
    auto iter =
        std::find(resulted_keys.begin(), resulted_keys.end(), skipped_vars[i]);
    while (iter != resulted_keys.end()) {
      resulted_keys.erase(iter);
      iter = std::find(
          resulted_keys.begin(), resulted_keys.end(), skipped_vars[i]);
}
}
return resulted_keys;
}
std::vector<std::string> Scope::LocalVarNames() const {
std::vector<std::string> keys;
for (const auto &item : vars_) {
......
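The `AttributeVarNames` implementation above filters out "feed" and "fetch" with a repeated find-and-erase loop. A minimal standalone sketch of the equivalent erase-remove formulation (plain STL only; `RemoveSkippedVars` is an illustrative name, not part of this commit), assuming the same goal of dropping every occurrence of a few reserved names:

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Drop every occurrence of each skipped name ("feed", "fetch") from `keys`.
void RemoveSkippedVars(std::vector<std::string>* keys) {
  const std::vector<std::string> skipped = {"feed", "fetch"};
  for (const auto& name : skipped) {
    keys->erase(std::remove(keys->begin(), keys->end(), name), keys->end());
  }
}
```

The behavior is the same as the loop above; it simply avoids rescanning the vector after every erase.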
......@@ -45,6 +45,8 @@ class Scope final {
const Scope* parent() const { return parent_; }
// Get attribute params stored in parent scopes.
std::vector<std::string> AttributeVarNames() const;
// Following the legacy scope interface.
std::vector<std::string> LocalVarNames() const;
......
......@@ -54,7 +54,7 @@ git checkout release/v2.3
--arm_lang=gcc \
--android_stl=c++_static \
--build_extra=ON \
--shutdown_log=OFF \
--with_log=ON \
full_publish
```
......
cmake_minimum_required(VERSION 2.8)
project(demo CXX C)
add_definitions(-DLITE_WITH_CUDA)
set(TARGET demo)
set(CMAKE_CXX_FLAGS "-std=c++11 -O3")
set(LITE_ROOT "${PROJECT_SOURCE_DIR}/../../cxx")
set(PROTOBUF_ROOT "${PROJECT_SOURCE_DIR}/../../third_party/protobuf")
include_directories("${LITE_ROOT}/include")
link_directories("${LITE_ROOT}/lib")
link_directories("${PROTOBUF_ROOT}/lib")
# cuda lib
link_directories("/usr/local/cuda/lib64/")
add_executable(${TARGET} ${TARGET}.cc)
set(DEPS ${LITE_ROOT}/lib/libpaddle_full_api_shared.so)
set(DEPS ${DEPS} protobuf-lite)
set(DEPS ${DEPS} "-lrt -lpthread -ldl -lcudart")
target_link_libraries(${TARGET} ${DEPS})
ARM_ABI = arm7
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include
CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS)
LITE_FULL_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared
LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a
LITE_LIGHT_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared
LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a
##########
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
test_helper.o: test_helper.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc
classification_full.o: classification_full.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc
classification_light.o: classification_light.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc
classification_full_shared: fetch_opencv classification_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
classification_full_static: fetch_opencv classification_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
classification_light_shared: fetch_opencv classification_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
classification_light_static: fetch_opencv classification_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
######
yolov3_full.o: yolov3_full.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc
yolov3_light.o: yolov3_light.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc
yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
#####
all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static
clean:
rm -f *.o
rm -f classification_full_shared
rm -f classification_full_static
rm -f classification_light_shared
rm -f classification_light_static
rm -f yolov3_full_shared
rm -f yolov3_full_static
rm -f yolov3_light_shared
rm -f yolov3_light_static
ARM_ABI = arm8
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include
CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS)
LITE_FULL_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared
LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a
LITE_LIGHT_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared
LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a
##########
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
test_helper.o: test_helper.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc
classification_full.o: classification_full.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc
classification_light.o: classification_light.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc
classification_full_shared: fetch_opencv classification_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
classification_full_static: fetch_opencv classification_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
classification_light_shared: fetch_opencv classification_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
classification_light_static: fetch_opencv classification_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
######
yolov3_full.o: yolov3_full.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc
yolov3_light.o: yolov3_light.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc
yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
#####
all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static
clean:
rm -f *.o
rm -f classification_full_shared
rm -f classification_full_static
rm -f classification_light_shared
rm -f classification_light_static
rm -f yolov3_full_shared
rm -f yolov3_full_static
rm -f yolov3_light_shared
rm -f yolov3_light_static
**Testing the Paddle Lite C++ inference libraries**
1. Build the full_publish library with build_extra enabled, e.g. `./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON full_publish`.
2. Enter the build output directory, e.g. `build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/test_libs`, and run `sh prepare.sh`; all test files are collected in the `test_lite_lib_files` directory.
3. Push `test_lite_lib_files` to the phone, enter the `test_lite_lib_files` directory on the device, and run `sh run.sh`. Check the log for the test results, which cover the light library, the full library, and both the shared and static variants.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
DEFINE_string(model_dir,
"",
"the path of the model, the model and param files is under "
"model_dir.");
DEFINE_string(model_filename,
"",
"the filename of model file. When the model is combined formate, "
"please set model_file.");
DEFINE_string(param_filename,
"",
"the filename of param file, set param_file when the model is "
"combined formate.");
DEFINE_string(img_path, "", "the path of input image");
DEFINE_string(img_txt_path,
"",
"the path of input image, the image is processed "
" and saved in txt file");
DEFINE_double(out_max_value, 0.0, "The max value in output tensor");
DEFINE_double(threshold,
1e-3,
"If the max value diff is smaller than threshold, pass test");
DEFINE_int32(out_max_value_index, 65, "The max value index in output tensor");
// Optimize model for ARM CPU.
// If the model is not in the combined format, leave model_filename and params_filename empty
void OptModel(const std::string& load_model_dir,
const std::string& model_filename,
const std::string& params_filename,
const std::string& save_model_path) {
paddle::lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
if (!model_filename.empty() && !params_filename.empty()) {
config.set_model_file(load_model_dir + "/" + model_filename);
config.set_param_file(load_model_dir + "/" + params_filename);
}
  std::vector<paddle::lite_api::Place> valid_places = {
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt32)},
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt64)},
  };
  config.set_valid_places(valid_places);
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
std::string cmd_str = "rm -rf " + save_model_path;
int ret = system(cmd_str.c_str());
if (ret == 0) {
std::cout << "Delete old optimized model " << save_model_path << std::endl;
}
predictor->SaveOptimizedModel(save_model_path,
paddle::lite_api::LiteModelType::kNaiveBuffer);
std::cout << "Load model from " << load_model_dir << std::endl;
std::cout << "Save optimized model to " << save_model_path << std::endl;
}
void Run(const std::string& model_path,
const std::string& img_path,
const std::string& img_txt_path,
const float out_max_value,
const int out_max_value_index,
const float threshold,
const int height,
const int width) {
// set config and create predictor
paddle::lite_api::MobileConfig config;
config.set_threads(3);
config.set_model_from_file(model_path);
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
// set input
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize({1, 3, height, width});
auto input_data = input_tensor->mutable_data<float>();
if (img_txt_path.size() > 0) {
std::fstream fs(img_txt_path);
if (!fs.is_open()) {
std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl;
}
int num = 1 * 3 * height * width;
for (int i = 0; i < num; i++) {
fs >> input_data[i];
}
} else {
cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
if (!img.data) {
std::cerr << "Fail to open img:" << img_path << std::endl;
exit(1);
}
float means[3] = {0.485f, 0.456f, 0.406f};
float scales[3] = {0.229f, 0.224f, 0.225f};
process_img(img, width, height, input_data, means, scales);
}
predictor->Run();
auto out_tensor = predictor->GetOutput(0);
auto* out_data = out_tensor->data<float>();
int64_t output_num = ShapeProduction(out_tensor->shape());
float max_value = out_data[0];
int max_index = 0;
for (int i = 0; i < output_num; i++) {
if (max_value < out_data[i]) {
max_value = out_data[i];
max_index = i;
}
}
std::cout << "max_value:" << max_value << std::endl;
std::cout << "max_index:" << max_index << std::endl;
std::cout << "max_value_ground_truth:" << out_max_value << std::endl;
std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl;
if (max_index != out_max_value_index ||
fabs(max_value - out_max_value) > threshold) {
std::cerr << "----------Fail Test.---------- \n\n";
} else {
std::cout << "----------Pass Test.---------- \n\n";
}
}
int main(int argc, char** argv) {
// Check inputs
google::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir.empty() ||
(FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) {
std::cerr << "Input error." << std::endl;
std::cerr
<< "Usage: " << argv[0] << std::endl
<< "--model_dir: the path of not optimized model \n"
"--model_filename: the model filename of not optimized model \n"
"--param_filename: the param filename of not optimized model \n"
"--img_txt_path: the path of input image, the image is processed \n"
" and saved in txt file \n"
"--img_path: the path of input image \n"
"--out_max_value: The max value in output tensor \n"
"--threshold: If the max value diff is smaller than threshold,\n"
" pass test. Default 1e-3.\n"
"--out_max_value_index: The max value index in output tensor \n";
exit(1);
}
const int height = 224;
const int width = 224;
std::string model_dir = FLAGS_model_dir;
if (model_dir.back() == '/') {
model_dir.pop_back();
}
std::string optimized_model_path = model_dir + "_opt2";
OptModel(FLAGS_model_dir,
FLAGS_model_filename,
FLAGS_param_filename,
optimized_model_path);
std::string run_model_path = optimized_model_path + ".nb";
// Run test
Run(run_model_path,
FLAGS_img_path,
FLAGS_img_txt_path,
FLAGS_out_max_value,
FLAGS_out_max_value_index,
FLAGS_threshold,
height,
width);
return 0;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
DEFINE_string(optimized_model_path, "", "the path of optimized model");
DEFINE_string(img_path, "", "the path of input image");
DEFINE_string(img_txt_path,
"",
"the path of input image, the image is processed "
" and saved in txt file");
DEFINE_double(out_max_value, 0.0, "The max value in output tensor");
DEFINE_double(threshold,
1e-3,
"If the max value diff is smaller than threshold, pass test");
DEFINE_int32(out_max_value_index, -1, "The max value index in output tensor");
void Run(const std::string& model_path,
const std::string& img_path,
const std::string& img_txt_path,
const float out_max_value,
const int out_max_value_index,
const float threshold,
const int height,
const int width) {
// set config and create predictor
paddle::lite_api::MobileConfig config;
config.set_threads(3);
config.set_model_from_file(model_path);
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
// set input
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize({1, 3, height, width});
auto input_data = input_tensor->mutable_data<float>();
if (img_txt_path.size() > 0) {
std::fstream fs(img_txt_path);
if (!fs.is_open()) {
std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl;
}
int num = 1 * 3 * height * width;
for (int i = 0; i < num; i++) {
fs >> input_data[i];
}
} else {
cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
if (!img.data) {
std::cerr << "Fail to open img:" << img_path << std::endl;
exit(1);
}
float means[3] = {0.485f, 0.456f, 0.406f};
float scales[3] = {0.229f, 0.224f, 0.225f};
process_img(img, width, height, input_data, means, scales);
}
predictor->Run();
auto out_tensor = predictor->GetOutput(0);
auto* out_data = out_tensor->data<float>();
int64_t output_num = ShapeProduction(out_tensor->shape());
float max_value = out_data[0];
int max_index = 0;
for (int i = 0; i < output_num; i++) {
if (max_value < out_data[i]) {
max_value = out_data[i];
max_index = i;
}
}
std::cout << "max_value:" << max_value << std::endl;
std::cout << "max_index:" << max_index << std::endl;
std::cout << "max_value_ground_truth:" << out_max_value << std::endl;
std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl;
if (max_index != out_max_value_index ||
fabs(max_value - out_max_value) > threshold) {
std::cerr << "----------Fail Test---------- \n\n";
} else {
std::cout << "----------Pass Test---------- \n\n";
}
}
int main(int argc, char** argv) {
// Check inputs
google::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_optimized_model_path.empty() ||
(FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) {
std::cerr << "Input error." << std::endl;
std::cerr
<< "Usage: " << argv[0] << std::endl
<< "--optimized_model_path: the path of optimized model \n"
"--img_txt_path: the path of input image, the image is processed \n"
" and saved in txt file \n"
"--img_path: the path of input image \n"
"--out_max_value: The max value in output tensor \n"
"--threshold: If the max value diff is smaller than threshold,\n"
" pass test. Default 1e-3.\n"
"--out_max_value_index: The max value index in output tensor \n";
exit(1);
}
const int height = 224;
const int width = 224;
// Run test
Run(FLAGS_optimized_model_path,
FLAGS_img_path,
FLAGS_img_txt_path,
FLAGS_out_max_value,
FLAGS_out_max_value_index,
FLAGS_threshold,
height,
width);
return 0;
}
make clean
make all -j
gf=test_lite_lib_files
if [ -d ${gf} ];then
rm -rf ${gf}
fi
mkdir ${gf}
mv classification_full_shared ${gf}
mv classification_full_static ${gf}
mv classification_light_shared ${gf}
mv classification_light_static ${gf}
mv yolov3_full_shared ${gf}
mv yolov3_full_static ${gf}
mv yolov3_light_shared ${gf}
mv yolov3_light_static ${gf}
cp run.sh ${gf}
make clean
cp -r ../../../cxx/ ${gf}
mv ${gf}/cxx ${gf}/lite
if [ ! -f "test_libs_models_imgs.tgz" ];then
wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/test_libs_models_imgs.tgz
fi
tar zxvf test_libs_models_imgs.tgz
mv test_libs_models_imgs ${gf}
mv ${gf}/test_libs_models_imgs ${gf}/models_imgs
export LD_LIBRARY_PATH=$PWD/lite/lib/:${LD_LIBRARY_PATH}
# mobilenetv1
model_name="mobilenetv1"
input_params="--img_txt_path=models_imgs/images/classification.jpg.txt \
--out_max_value=0.936887 \
--out_max_value_index=65"
echo "Test ${model_name}: light_shared, light_static, full_shared, full_static."
./classification_light_shared ${input_params} \
--optimized_model_path=models_imgs/models/mobilenetv1.nb
./classification_light_static ${input_params} \
--optimized_model_path=models_imgs/models/mobilenetv1.nb
./classification_full_shared ${input_params} \
--model_dir=models_imgs/models/mobilenetv1
./classification_full_static ${input_params} \
--model_dir=models_imgs/models/mobilenetv1
# mobilenetv2
model_name="mobilenetv2"
input_params="--img_txt_path=models_imgs/images/classification.jpg.txt \
--out_max_value=0.868888 \
--out_max_value_index=65"
echo "Test ${model_name}: light_shared, light_static, full_shared, full_static."
./classification_light_shared ${input_params} \
--optimized_model_path=models_imgs/models/mobilenetv2.nb
./classification_light_static ${input_params} \
--optimized_model_path=models_imgs/models/mobilenetv2.nb
./classification_full_shared ${input_params} \
--model_dir=models_imgs/models/mobilenetv2
./classification_full_static ${input_params} \
--model_dir=models_imgs/models/mobilenetv2
# shufflenetv2
model_name="shufflenetv2"
input_params="--img_txt_path=models_imgs/images/classification.jpg.txt \
--out_max_value=0.776729 \
--out_max_value_index=65"
echo "Test ${model_name}: light_shared, light_static, full_shared, full_static."
./classification_light_shared ${input_params} \
--optimized_model_path=models_imgs/models/shufflenetv2.nb
./classification_light_static ${input_params} \
--optimized_model_path=models_imgs/models/shufflenetv2.nb
./classification_full_shared ${input_params} \
--model_dir=models_imgs/models/shufflenetv2
./classification_full_static ${input_params} \
--model_dir=models_imgs/models/shufflenetv2
# yolov3
model_name="yolov3"
input_params="--img_txt_path=models_imgs/images/yolov3.jpg.txt \
--out_values=0,0.153605,174.494,199.729,562.075,604.014"
echo "Test ${model_name}: light_shared, light_static, full_shared, full_static."
./yolov3_light_shared ${input_params} \
--optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb
./yolov3_light_static ${input_params} \
--optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb
./yolov3_full_shared ${input_params} \
--model_dir=models_imgs/models/yolov3_mobilenetv1
./yolov3_full_static ${input_params} \
--model_dir=models_imgs/models/yolov3_mobilenetv1
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "test_helper.h" // NOLINT
#include <sys/time.h>
#include <time.h>
#include <arm_neon.h>  // float32x4_t and vld3q_f32/vst1q_f32 used below
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
int64_t ShapeProduction(const std::vector<int64_t>& shape) {
int64_t num = 1;
for (auto i : shape) {
num *= i;
}
return num;
}
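// Parse a comma-separated string such as "1,3,224,224" into a vector of integers.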
std::vector<int64_t> GetIntNumsFromStr(const std::string& str) {
std::vector<int64_t> nums;
std::string tmp_str = str;
while (!tmp_str.empty()) {
int num = atoi(tmp_str.data());
nums.push_back(num);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return nums;
}
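// Parse a comma-separated string such as "0.5,0.25" into a vector of doubles.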
std::vector<double> GetDoubleNumsFromStr(const std::string& str) {
std::vector<double> nums;
std::string tmp_str = str;
while (!tmp_str.empty()) {
double num = atof(tmp_str.data());
nums.push_back(num);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return nums;
}
// Normalize with (x - mean) / scale and transpose the layout NHWC -> NCHW, accelerated with NEON
void neon_mean_scale(
const float* din, float* dout, int size, float* mean, float* scale) {
float32x4_t vmean0 = vdupq_n_f32(mean[0]);
float32x4_t vmean1 = vdupq_n_f32(mean[1]);
float32x4_t vmean2 = vdupq_n_f32(mean[2]);
float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);
float* dout_c0 = dout;
float* dout_c1 = dout + size;
float* dout_c2 = dout + size * 2;
int i = 0;
for (; i < size - 3; i += 4) {
float32x4x3_t vin3 = vld3q_f32(din);
float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
vst1q_f32(dout_c0, vs0);
vst1q_f32(dout_c1, vs1);
vst1q_f32(dout_c2, vs2);
din += 12;
dout_c0 += 4;
dout_c1 += 4;
dout_c2 += 4;
}
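  // Handle the remaining (size % 4) pixels with scalar code.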
for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) / scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) / scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) / scale[2];
}
}
// Process img and set it as input
void process_img(const cv::Mat& img,
int width,
int height,
float* dest_data,
float* means,
float* scales) {
cv::Mat rgb_img;
cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
cv::Mat imgf;
rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
const float* dimg = reinterpret_cast<const float*>(imgf.data);
neon_mean_scale(dimg, dest_data, width * height, means, scales);
}
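As a cross-check for the NEON routine above, a minimal scalar reference under the same assumptions (NHWC, 3-channel float input; channel-planar NCHW output; `mean_scale_reference` is an illustrative helper, not part of the demo):

```cpp
// Scalar reference for neon_mean_scale: for each of `size` pixels,
// dout[c * size + i] = (din[i * 3 + c] - mean[c]) / scale[c].
void mean_scale_reference(const float* din,
                          float* dout,
                          int size,
                          const float* mean,
                          const float* scale) {
  for (int i = 0; i < size; ++i) {
    for (int c = 0; c < 3; ++c) {
      dout[c * size + i] = (din[i * 3 + c] - mean[c]) / scale[c];
    }
  }
}
```

Comparing its output against `neon_mean_scale` on a few random buffers is a quick way to validate the vectorized path.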
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
double GetCurrentUS();
int64_t ShapeProduction(const std::vector<int64_t>& shape);
std::vector<int64_t> GetIntNumsFromStr(const std::string& str);
std::vector<double> GetDoubleNumsFromStr(const std::string& str);
void neon_mean_scale(
const float* din, float* dout, int size, float* mean, float* scale);
void process_img(const cv::Mat& img,
int width,
int height,
float* dst_data,
float* means,
float* scales);
cmake_minimum_required(VERSION 2.8)
set(TARGET mobilenet_full_api)
# 1. path to Paddle-Lite lib and mklml lib
set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
# 2. link mklml and Paddle-Lite directory
link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
# 3. compile options
add_definitions(-std=c++11 -g -O3 -pthread)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
# 4.add executable output
add_executable(${TARGET} ${TARGET}.cc)
target_link_libraries(${TARGET} -lpaddle_full_api_shared)
target_link_libraries(${TARGET} -lmklml_intel)
target_link_libraries(${TARGET} -ldl)
mkdir ./build
cd ./build
cmake ..
make
cd ..
rm -rf ./build
mkdir ./build
cd ./build
cmake ..
make
cd ..
rm -rf ./build