Unverified commit 7cccc85c, authored by HappyAngel, committed by GitHub

Merge branch 'develop' into conv_dw_5x5

@@ -23,6 +23,9 @@ if(NOT DEFINED BM_SDK_ROOT)
   endif()
 endif()
 
+set(BM_SDK_CPLIB_RPATH ${BM_SDK_ROOT}/lib/bmcompiler)
+set(BM_SDK_LIB_RPATH ${BM_SDK_ROOT}/lib/bmnn/pcie)
+
 message(STATUS "BM_SDK_ROOT: ${BM_SDK_ROOT}")
 find_path(BM_SDK_INC NAMES bmruntime_interface.h
   PATHS ${BM_SDK_ROOT}/include/bmruntime NO_DEFAULT_PATH)
@@ -37,43 +40,35 @@ include_directories("${BM_SDK_ROOT}/include/bmcpu")
 include_directories("${BM_SDK_ROOT}/include/bmlog")
 
 find_library(BM_SDK_RT_LIB NAMES bmrt
-             PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie)
+             PATHS ${BM_SDK_LIB_RPATH})
 if(NOT BM_SDK_RT_LIB)
   message(FATAL_ERROR "Can not find bmrt Library in ${BM_SDK_ROOT}")
 else()
   message(STATUS "Found bmrt Library: ${BM_SDK_RT_LIB}")
-  add_library(bmrt SHARED IMPORTED GLOBAL)
-  set_property(TARGET bmrt PROPERTY IMPORTED_LOCATION ${BM_SDK_RT_LIB})
 endif()
 
 find_library(BM_SDK_BM_LIB NAMES bmlib
-             PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie)
+             PATHS ${BM_SDK_LIB_RPATH})
 if(NOT BM_SDK_BM_LIB)
   message(FATAL_ERROR "Can not find bmlib Library in ${BM_SDK_ROOT}")
 else()
   message(STATUS "Found bmlib Library: ${BM_SDK_BM_LIB}")
-  add_library(bmlib SHARED IMPORTED GLOBAL)
-  set_property(TARGET bmlib PROPERTY IMPORTED_LOCATION ${BM_SDK_BM_LIB})
 endif()
 
 find_library(BM_SDK_COMPILER_LIB NAMES bmcompiler
-             PATHS ${BM_SDK_ROOT}/lib/bmcompiler)
+             PATHS ${BM_SDK_CPLIB_RPATH})
 if(NOT BM_SDK_COMPILER_LIB)
   message(FATAL_ERROR "Can not find bmcompiler Library in ${BM_SDK_ROOT}")
 else()
   message(STATUS "Found bmcompiler Library: ${BM_SDK_COMPILER_LIB}")
-  add_library(bmcompiler SHARED IMPORTED GLOBAL)
-  set_property(TARGET bmcompiler PROPERTY IMPORTED_LOCATION ${BM_SDK_COMPILER_LIB})
 endif()
 
 find_library(BM_SDK_CPU_LIB NAMES bmcpu
-             PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie)
+             PATHS ${BM_SDK_LIB_RPATH})
 if(NOT BM_SDK_CPU_LIB)
   message(FATAL_ERROR "Can not find bmcpu Library in ${BM_SDK_ROOT}")
 else()
   message(STATUS "Found bmcpu Library: ${BM_SDK_CPU_LIB}")
-  add_library(bmcpu SHARED IMPORTED GLOBAL)
-  set_property(TARGET bmcpu PROPERTY IMPORTED_LOCATION ${BM_SDK_CPU_LIB})
 endif()
 
 set(bm_runtime_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm runtime libs")
...
@@ -44,6 +44,8 @@ sh run_benchmark.sh
3. It automatically runs another script, `benchmark.sh` (if several phones are connected over USB, append the test phone's `serial number` to the `adb` commands in `benchmark.sh`);
4. It downloads the benchmark results `result_armv7.txt` and `result_armv8.txt` from the phone to the current directory and displays the benchmark results.

> **Note:** If you hit an `Operation not permitted` error while running, grant permission with `sudo sh run_benchmark.sh`, toggle **USB debugging** and **File Transfer mode** off and on again on the phone, or reconnect the phone over USB and run the script again.

## 2. Step-by-step Benchmark

### 1. Build the benchmark executable
...
@@ -36,9 +36,11 @@
**Required environment**: Android Studio, an Android phone (with USB debugging enabled), and a local copy of the [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) project.

**Prerequisite**: If your Android Studio has not been configured with an NDK yet, follow [Install and configure the NDK and CMake](https://developer.android.com/studio/projects/install-ndk) in the Android Studio user guide to set one up first. You may choose the latest NDK version, or keep it consistent with the NDK version used in [Android build environment setup](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#android).

**Deployment steps**

1. The Android object detection demo is located at `Paddle-Lite-Demo\PaddleLite-android-demo\object_detection_demo`.

2. Open the object_detection_demo project in Android Studio (this step requires an Internet connection).

@@ -46,12 +48,17 @@
![Android_studio](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Android_studio.png)

**Note:** If you get an NDK configuration error while importing, building, or running the project, open File > Project Structure > SDK Location and set "Android NDK location" to the path of the NDK installed on your machine. If you downloaded the NDK through Android Studio's SDK Tools (see "Prerequisite" above), you can simply pick the default path from the drop-down. If the NDK configuration error still cannot be resolved, try updating the Android Gradle plugin as described in the [Update the Android Gradle plugin](https://developer.android.com/studio/releases/gradle-plugin?hl=zh-cn#updating-plugin) section of the Android Studio documentation.

<p align="center"><img width="600" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Andriod_Studio_NDK.png"/></p>

4. Press the Run button to automatically build the APP and install it on the phone. (This process automatically downloads the Paddle-Lite prediction library and the model, and requires an Internet connection.)

After a successful run it looks as follows. Figure 1: the APP installed on the phone. Figure 2: the opened APP, which automatically detects and labels the objects in the image.

<p align="center"><img width="300" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/AndroidApp0.png"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="300" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/AndroidApp1.jpg"/></p>

## Walkthrough of the Android demo structure

The code structure of the Android demo is shown in the figure below:
...
(One file's diff is collapsed and not shown.)
# Deploying PaddleLite Inference on the NPU (Huawei)

Paddle Lite is the first inference framework to support Huawei's in-house DaVinci-architecture NPU (the NPU in Kirin 810/990 SoCs).

It works by analyzing the Paddle model online, converting Paddle operators into HiAI IR, and then calling the HiAI IR/Builder/Runtime APIs to generate and execute a HiAI model.

## Supported devices

- Huawei nova5, nova5i pro, mate30, mate30 pro, mate30 5G, Honor v30, p40, and p40 pro, as well as the upcoming mate40. According to Huawei, most phones released from now on will ship with its in-house DaVinci-architecture NPU.

## Supported models
- MobileNetV1
- MobileNetV2
- ResNet-18/50
- ShuffleNetV2
- squeezenet
- mnasnet
- yolov3
- CycleGAN (currently requires a Huawei internal ROM)
- Baidu internal production models (details cannot be disclosed for confidentiality reasons)

*Mixed CPU/NPU scheduling can deliver better performance for some models.*

## Supported (or partially supported) Paddle operators
- sigmoid
- relu
- tanh
- relu_clipped
- leaky_relu
- softsign
- hard_sigmoid
- batch_norm
- concat
- conv2d
- depthwise_conv2d
- conv2d_transpose
- dropout
- elementwise_add
- elementwise_sub
- elementwise_mul
- elementwise_div
- fusion_elementwise_add_activation
- fusion_elementwise_sub_activation
- fusion_elementwise_mul_activation
- fusion_elementwise_div_activation
- fc
- bilinear_interp
- nearest_interp
- matmul
- mul
- pad2d
- pool2d
- reduce_mean
- reshape
- reshape2
- scale
- shuffle_channel
- softmax
- split
- sqrt
- square
- transpose
- transpose2
- unsqueeze
- unsqueeze2
- instance_norm (currently requires a Huawei internal ROM)
- layer_norm (currently requires a Huawei internal ROM)

## Building a Paddle Lite library with NPU support

- Download the Huawei HiAI DDK from the [Huawei HiAI platform](https://developer.huawei.com/consumer/cn/hiai) and extract it to any path (note: Huawei provides several DDK versions; download the HiAI Foundation development kit for the Kirin 810/990 chips, e.g. [DDK V310](https://obs.cn-north-2.myhwclouds.com/hms-ds-wf/sdk/hwhiai-ddk-100.310.011.010.zip)).
- Copy the ai_ddk_lib directory from the HiAI DDK into the root of the Paddle Lite source tree, then build with the [build script](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/tools/build_android.sh) (the NPU-related options must be specified).

Note: the directory layout of HiAI DDK V310 after extraction is shown below; it is the ai_ddk_lib directory that must be copied into the Paddle Lite source root.
```shell
- app_sample
- ddk
- ai_ddk_lib
- include
- lib # for armv7
- lib64 # for armv8
- document
- tools
```
- Recommended build command. Since the HiAI DDK .so libraries are all built against c++_shared, it is recommended to build Paddle Lite with c++_shared as well.
```shell
# huawei_kirin_npu_sdk_root must point to the path of ai_ddk_lib
$ ./lite/tools/build_android.sh --android_stl=c++_shared --with_huawei_kirin_npu=ON --huawei_kirin_npu_sdk_root=<path-to-ai_ddk_lib>
# Other options (e.g. the ARM architecture version) can be listed with "./lite/tools/build_android.sh help"
```
Note: to keep the build environment consistent, it is recommended to set up the Docker development environment described in [Compile from source](../user_guides/source_compile) first, and then run the command above.

## Generating an optimized NPU model

- The model_optimize_tool already supports generating NPU models: simply set valid_targets to npu,arm. See [model conversion](../user_guides/model_optimize_tool) for details.
```shell
./model_optimize_tool --model_dir=<model_param_dir> \
--model_file=<model_path> \
--param_file=<param_path> \
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=npu,arm \
    --record_tailoring_info=(true|false)
```
- The model produced by model_optimize_tool only marks the Paddle operators supported by the NPU; it does not actually generate an NPU HiAI model. Only at execution time are the marked Paddle operators converted into HiAI IR and the HiAI model generated and executed. See PR [2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576) for the implementation.
- Depending on the model and the phone model (ROM version), HiAI model generation may still fail at execution time, for example because some Paddle operators cannot be fully converted into HiAI IR or the target phone's HiAI version is too old. In that case, Paddle Lite falls back to the CPU operators to complete the whole inference task.

## Loading and running NPU models through the Java API

**Note: due to root-permission restrictions on Huawei phones, loading and running NPU models is currently supported only through the Java API.**

- Usage is the same as in the [Java demo](java_demo): no extra parameters need to be set; just swap in the NPU model. The Image Classification Demo for Android in [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) supports both CPU and NPU models.

Note: when copying libpaddle_lite_jni.so, since it depends on the HiAI DDK .so files and on libc++_shared.so, all .so files under ai_ddk_lib/lib or ai_ddk_lib/lib64 in the HiAI DDK, together with libc++_shared.so, must be copied into the same directory as libpaddle_lite_jni.so.

## Other notes

- Huawei's DaVinci-architecture NPU performs much of its internal computation in float16, so prediction results will deviate slightly; in most cases there is no significant accuracy loss. See the CPU vs. NPU predictions for the same image in the Image Classification Demo for Android in [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo).
- The in-house DaVinci-architecture NPU in the Kirin 810/990 SoCs is different from the Cambricon NPU in the Kirin 970/980 SoCs, and likewise different from the NNIE used by the Hi3559A and Hi3519A. Paddle Lite only supports Huawei's in-house DaVinci-architecture NPU.
- We keep adding Paddle operator bridges/converters for HiAI IR so that more Paddle models can be adapted, and Huawei engineers keep optimizing HiAI IR performance.

## Manually splitting subgraphs

### Background

- Paddle-Lite already supports a large number of Huawei NPU operators, but it still cannot cover every model. For a model with some unsupported operators, Paddle-Lite partitions the model into subgraphs that run on the NPU and subgraphs that run on the CPU and schedules them automatically, which usually gives good performance. In some special cases, however, the model is automatically split into a large number of subgraphs; the CPU/NPU switching overhead then becomes significant and overall performance degrades. A manual subgraph-splitting facility is therefore needed to pin selected operators to the CPU and avoid an excessive number of subgraphs.

### Feature

- A configuration file specifies the operators that are forced to run on the CPU.

### Usage

- 1. Open the Paddle model file with netron to inspect the model structure and obtain the operator types and their input and output names.
- Note: Paddle-Lite optimizes the model and operators may change in the process; use the operators of the optimized model as the reference. An example is given below.
- 2. Create a configuration file ```split_cfg.txt``` recording the operators that must run on the CPU.
- One OP record per line: the "op type", "op input names", and "op output names" are separated by colons ":", and the individual var names within "op input names" or "op output names" are separated by commas ",". (A minimal parsing sketch is given after this list.)
- Input or output names may be partially omitted. For example, ```op3:in3_var0``` selects operators of type "op3" whose input is "in3_var0", while ```op4``` selects all operators of type "op4".
- Example 1:
```
op0:in0_var0,in0_var1:out0_var0,out0_var1
op1:in1_var0,in1_var1:out1_var0
op2::out2_var0
op3:in3_var0
op4
```
- Example 2:
```
transpose:conv2d_22.tmp_1:transpose_0.tmp_0
```
![image](https://user-images.githubusercontent.com/50474132/80475316-4a5fda80-897b-11ea-910a-6aee13243387.png)
- 3. Use the environment variable ```SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE``` to specify the location of the configuration file.
- For example:
```
export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=/data/local/tmp/split_cfg.txt
```
- 4. After the steps above, operators in the running model that match the configuration will be forced to run on the CPU.
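To make the `op:inputs:outputs` convention above concrete, here is a minimal, hypothetical C++ sketch of parsing one line of `split_cfg.txt`. It is only an illustration of the format; it is not the parser Paddle-Lite actually uses, and the type and function names are made up.

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Illustrative only: splits one config line of the form
//   op_type[:input_var0,input_var1[:output_var0,output_var1]]
// into its three parts. Empty parts mean "match any".
struct OpFilter {
  std::string op_type;
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
};

static std::vector<std::string> SplitByComma(const std::string& s) {
  std::vector<std::string> out;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, ',')) {
    if (!item.empty()) out.push_back(item);
  }
  return out;
}

static OpFilter ParseLine(const std::string& line) {
  OpFilter f;
  std::stringstream ss(line);
  std::string part;
  int idx = 0;
  while (std::getline(ss, part, ':')) {
    if (idx == 0) f.op_type = part;          // e.g. "op3"
    else if (idx == 1) f.inputs = SplitByComma(part);   // e.g. {"in3_var0"}
    else if (idx == 2) f.outputs = SplitByComma(part);  // may be empty
    ++idx;
  }
  return f;
}

int main() {
  OpFilter f = ParseLine("op1:in1_var0,in1_var1:out1_var0");
  std::cout << f.op_type << " has " << f.inputs.size()
            << " input filter(s) and " << f.outputs.size()
            << " output filter(s)\n";  // prints: op1 has 2 input filter(s) and 1 output filter(s)
  return 0;
}
```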
### Example

- Take the model [ssd_mobilenet_v1](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz) as an example.
- 1. Inspect the model with netron.
- 2. Initial analysis.
- The figure below shows part of ssd_mobilenet_v1. The red part is not yet supported on the NPU, and the blue part may not perform well on the NPU. If the prediction library is simply left to schedule automatically, the model may be split into many subgraphs and overall performance will suffer. Therefore, the blue and green parts can be pinned to the CPU as a whole, letting the remaining parts run on the NPU automatically (the red part automatically runs on the CPU).
![](https://user-images.githubusercontent.com/50474132/80453173-525b5280-895a-11ea-847f-c7dd5b5799de.png)
- 3. Convert the model with opt.
- opt prints log messages during conversion. Search the log for ```digraph G``` and ```// end G``` to locate the optimized model graph.
![](https://user-images.githubusercontent.com/50474132/80454098-145f2e00-895c-11ea-9f16-dde1483a9beb.png)
![](https://user-images.githubusercontent.com/50474132/80454123-1de89600-895c-11ea-86b9-a62d78a6616d.png)
- Save the whole model-graph section, from ```digraph G``` to ```// end G```, into a ```.dot``` file. It can be viewed with ```graphviz``` or in the [online viewer](http://dreampuf.github.io/GraphvizOnline/).
![](https://user-images.githubusercontent.com/50474132/80454841-47ee8800-895d-11ea-9531-5689c5560fcb.png)
- At this point, check whether the operators you intend to pin have been transformed by the optimizer. (The expectation is that each pinned operator still exists on its own; if it has been fused into another operator, the fused operator must be specified instead.)
- 4. Write the configuration file.
- List in the configuration file the operators that the NPU could support but that should nevertheless run on the CPU.
```
reshape
transpose
concat
softmax
```
- Since all of these operators are pinned to the CPU, there is no need to specify their input or output names.
- 5. Specify the configuration file path.
- This is done with ```export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=your_split_config_file```.
- 6. Performance test
- Device: Huawei mate30 5G
- HiAI DDK version: 320
- Performance: about 71.8 ms on the CPU vs. about 16.6 ms on the NPU (roughly a 4.3x speedup).
@@ -55,7 +55,7 @@ Welcome to Paddle-Lite's documentation!
    demo_guides/cuda
    demo_guides/opencl
    demo_guides/fpga
-   demo_guides/npu
+   demo_guides/huawei_kirin_npu
    demo_guides/baidu_xpu
    demo_guides/rockchip_npu
    demo_guides/mediatek_apu
...
@@ -3,7 +3,7 @@
**Note: this build method only applies to release/v2.6.0 and later (v2.6.0 included).**

If you have not yet set up an Android cross-compilation environment, first follow [Environment preparation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#id2) to install the toolchain needed to build the Android prediction library for your development environment. Before running the build script, check that the environment variable `NDK_ROOT` points to the correct Android NDK installation path; after that you can download and build the Paddle-Lite source code.

```shell
# 1. Download the Paddle-Lite source code and switch to the release branch
@@ -14,6 +14,7 @@ cd Paddle-Lite && git checkout release/v2.3
./lite/tools/build_android.sh
```

> **Tip:** If the build spends a long time downloading third-party libraries, try deleting the `<lite-repo>/third-party` directory under Paddle-Lite and running the build script again; the script will then download a third-party source bundle hosted on Baidu Cloud, which saves the time of cloning each third-party git repo.

### Build results
...
@@ -3,10 +3,14 @@
opt is an x86 executable and must be run on a PC; Linux and Mac terminals are supported.

### Help information

Running opt without any options prints the help information and lists the currently supported options:

```bash
./opt
```

> **Note:** If you obtained the opt executable through the download links in "Method 2: download the opt executable" on the [Prepare opt](https://paddle-lite.readthedocs.io/zh/latest/user_guides/model_optimize_tool.html#id1) page, first run `chmod +x ./opt` to give the downloaded file execute permission.

![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png)

### Feature 1: convert a model into the Paddle-Lite format
...
@@ -38,7 +38,7 @@
### 2.3 Configure the calibration data generator

Post-training static quantization reads the calibration data internally through asynchronous data loading; you only need to configure a sample_generator that reads data matching the model's inputs. sample_generator is a Python generator that **must return a single sample each time** and is used as the data source of `DataLoader.set_sample_generator()`.

It is recommended to study the [asynchronous data loading documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/data_preparing/static_mode/use_py_reader.html) together with the example in this document to learn how to configure the calibration data generator.

### 2.4 Invoke post-training static quantization
...
# Pre-built library downloads

## Build variants
...
# Model conversion tool X2Paddle

X2Paddle converts Caffe, TensorFlow, and ONNX models into models supported by Paddle. Currently supported versions: Caffe 1.0; TensorFlow 1.x (1.4.0 recommended); ONNX 1.6.0 with OpSet versions 9, 10, and 11.

[X2Paddle](https://github.com/PaddlePaddle/X2Paddle) supports converting Caffe/TensorFlow models into PaddlePaddle models.

For supported models, see the **X2Paddle model test suite:**
...
@@ -39,12 +39,16 @@ USE_MIR_PASS(identity_dropout_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass);
USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
USE_MIR_PASS(lite_match_matrix_activation_fuse_pass);
USE_MIR_PASS(lite_scales_fuse_pass);
USE_MIR_PASS(lite_sequence_reverse_embedding_fuse_pass);
USE_MIR_PASS(lite_elementwise_activation_fuse_pass);
USE_MIR_PASS(lite_quant_dequant_fuse_pass);
USE_MIR_PASS(type_precision_cast_pass);
USE_MIR_PASS(type_layout_cast_pass);
USE_MIR_PASS(type_layout_cast_preprocess_pass);
USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(lite_reshape_fuse_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
...
@@ -9,6 +9,7 @@ if(WIN32)
  target_link_libraries(lite_pybind ${os_dependency_modules})
else()
  lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
  target_sources(lite_pybind PUBLIC ${__lite_cc_files})
endif(WIN32)
if (LITE_ON_TINY_PUBLISH)
...
@@ -106,6 +106,42 @@ void conv_depthwise_3x3s1_int8(Dtype* dout,
                               int padh,
                               ARMContext* ctx);
void conv_depthwise_3x3s1_int8_int8_impl(int8_t* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
int flag_act,
float* alpha,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
void conv_depthwise_3x3s1_int8_float_impl(float* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
int flag_act,
float* alpha,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
template <typename Dtype>
void conv_depthwise_3x3s2_int8(Dtype* dout,
                               const int8_t* din,
@@ -340,6 +376,118 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
                                        const int w_out,
                                        ARMContext* ctx);
void conv_depthwise_3x3s1p0_bias_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p0_bias_s_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p1_bias_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p1_bias_s_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p0_bias_s_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p1_bias_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p1_bias_s_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
} // namespace math
} // namespace arm
} // namespace lite
...
@@ -841,24 +841,52 @@ void conv_depthwise_3x3_int8_fp32(const void* din,
      alpha[3] = local_alpha;
    }
  }
  bool support_act_type = flag_act <= 1;
  bool support_pad_type =
      (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
      (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
  bool support_stride_type = (param.strides[0] == 1 && param.strides[1] == 1);
  bool support_width_type = w_in > 9 ? true : false;
  if (stride == 1) {
    if (!support_act_type || !support_pad_type || !support_stride_type ||
        !support_width_type) {
      conv_depthwise_3x3s1_int8(reinterpret_cast<float*>(dout),
                                reinterpret_cast<const int8_t*>(din),
                                reinterpret_cast<const int8_t*>(weights),
                                scale,
                                bias,
                                flag_bias,
                                flag_act,
                                alpha,
                                num,
                                ch_in,
                                h_in,
                                w_in,
                                h_out,
                                w_out,
                                pad_w,
                                pad_h,
                                ctx);
    } else {
      conv_depthwise_3x3s1_int8_float_impl(
          reinterpret_cast<float*>(dout),
          reinterpret_cast<const int8_t*>(din),
          reinterpret_cast<const int8_t*>(weights),
          scale,
          bias,
          flag_bias,
          flag_act,
          alpha,
          num,
          ch_in,
          h_in,
          w_in,
          h_out,
          w_out,
          pad_w,
          pad_h,
          ctx);
    }
  } else if (stride == 2) {
    conv_depthwise_3x3s2_int8(reinterpret_cast<float*>(dout),
                              reinterpret_cast<const int8_t*>(din),
@@ -924,24 +952,52 @@ void conv_depthwise_3x3_int8_int8(const void* din,
      alpha[3] = local_alpha;
    }
  }
  bool support_act_type = flag_act <= 1;
  bool support_pad_type =
      (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
      (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
  bool support_stride_type = (param.strides[0] == 1 && param.strides[1] == 1);
  bool support_width_type = w_in > 9 ? true : false;
  if (stride == 1) {
    if (!support_act_type || !support_pad_type || !support_stride_type ||
        !support_width_type) {
      conv_depthwise_3x3s1_int8(reinterpret_cast<int8_t*>(dout),
                                reinterpret_cast<const int8_t*>(din),
                                reinterpret_cast<const int8_t*>(weights),
                                scale,
                                bias,
                                flag_bias,
                                flag_act,
                                alpha,
                                num,
                                ch_in,
                                h_in,
                                w_in,
                                h_out,
                                w_out,
                                pad_w,
                                pad_h,
                                ctx);
    } else {
      conv_depthwise_3x3s1_int8_int8_impl(
          reinterpret_cast<int8_t*>(dout),
          reinterpret_cast<const int8_t*>(din),
          reinterpret_cast<const int8_t*>(weights),
          scale,
          bias,
          flag_bias,
          flag_act,
          alpha,
          num,
          ch_in,
          h_in,
          w_in,
          h_out,
          w_out,
          pad_w,
          pad_h,
          ctx);
    }
  } else if (stride == 2) {
    conv_depthwise_3x3s2_int8(reinterpret_cast<int8_t*>(dout),
                              reinterpret_cast<const int8_t*>(din),
...
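Reading the two dispatch blocks above together: the new specialized `conv_depthwise_3x3s1_int8_float_impl` / `conv_depthwise_3x3s1_int8_int8_impl` kernels are taken only for no-activation or relu, symmetric padding of 0 or 1, unit strides, and an input width greater than 9; everything else falls back to the generic `conv_depthwise_3x3s1_int8` path. A condensed sketch of that predicate follows; the parameter names are paraphrased from the diff and this is not an exported API.

```cpp
// Sketch of the dispatch condition used in conv_depthwise_3x3_int8_fp32/_int8
// above (paraphrased; the real code keeps four separate booleans).
static bool UseSpecializedS1Int8Kernel(int flag_act,
                                       const int* paddings,  // {top, bottom, left, right}
                                       const int* strides,   // {h, w}
                                       int w_in) {
  bool support_act = flag_act <= 1;  // no activation, or plain relu
  bool support_pad = paddings[0] == paddings[1] && paddings[2] == paddings[3] &&
                     paddings[0] == paddings[2] &&
                     (paddings[0] == 0 || paddings[0] == 1);
  bool support_stride = strides[0] == 1 && strides[1] == 1;
  bool support_width = w_in > 9;
  return support_act && support_pad && support_stride && support_width;
}
```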
@@ -300,13 +300,15 @@ void fill_bias_act<float>(float* tensor,
       switch (act_param->active_type) {
         case lite_api::ActivationType::kRelu:
           for (int i = 0; i < remain; i++) {
-            *dst = *src >= 0.f ? *src : 0.f;
+            float tmp = (*src + bias_data);
+            *dst = tmp >= 0.f ? tmp : 0.f;
             src++;
             dst++;
           }
         case lite_api::ActivationType::kRelu6:
           for (int i = 0; i < remain; i++) {
-            float tmp = *src >= 0.f ? *src : 0.f;
+            float tmp = (*src + bias_data);
+            tmp = tmp >= 0.f ? tmp : 0.f;
             *dst = tmp <= act_param->Relu_clipped_coef
                        ? tmp
                        : act_param->Relu_clipped_coef;
@@ -315,10 +317,11 @@ void fill_bias_act<float>(float* tensor,
           }
         case lite_api::ActivationType::kLeakyRelu:
           for (int i = 0; i < remain; i++) {
-            if (*src >= 0.f) {
-              *dst = *src;
+            float tmp = (*src + bias_data);
+            if (tmp >= 0.f) {
+              *dst = tmp;
             } else {
-              *dst = *src * act_param->Leaky_relu_alpha;
+              *dst = tmp * act_param->Leaky_relu_alpha;
             }
             src++;
             dst++;
@@ -336,17 +339,24 @@ void fill_bias_act<float>(float* tensor,
       float32x4_t vbias = vdupq_n_f32(bias_data);
       float* src = data + j * channel_size;
       float* dst = data + j * channel_size;
+      if (cnt > 0) {
 #ifdef __aarch64__
       asm volatile(FILL_BIAS FILL_STORE
-                   : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
-                   : [vbias] "w"(vbias)
-                   : "memory", "cc", "v0", "v1", "v2", "v3");
+                   :
+                   [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+                   : [vbias] "w"(vbias)
+                   : "memory", "cc", "v0", "v1", "v2", "v3");
 #else
       asm volatile(FILL_BIAS FILL_STORE
-                   : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
-                   : [vbias] "w"(vbias)
-                   : "memory", "cc", "q3", "q4", "q5", "q6");
+                   :
+                   [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+                   : [vbias] "w"(vbias)
+                   : "memory", "cc", "q3", "q4", "q5", "q6");
 #endif
+      }
+      for (int i = 0; i < remain; i++) {
+        *dst = *src + bias_data;
+      }
     }
   }
 }
...
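To make the intent of the remainder-path change explicit: the bias is now added first, and the activation is applied to the biased value (the old code activated the un-biased value). Below is a minimal scalar reference of that behavior, with paraphrased names; it is an illustration under those assumptions, not the production routine.

```cpp
#include <algorithm>

// Scalar reference for the tail elements of a fill-bias-then-activate pass:
// the bias is added before the activation is applied.
enum class Act { kNone, kRelu, kRelu6, kLeakyRelu };

inline void fill_bias_act_tail(float* data, int remain, float bias,
                               Act act, float relu6_coef, float leaky_alpha) {
  for (int i = 0; i < remain; ++i) {
    float tmp = data[i] + bias;  // bias first
    switch (act) {               // then the activation
      case Act::kRelu:
        tmp = std::max(tmp, 0.f);
        break;
      case Act::kRelu6:
        tmp = std::min(std::max(tmp, 0.f), relu6_coef);
        break;
      case Act::kLeakyRelu:
        tmp = tmp >= 0.f ? tmp : tmp * leaky_alpha;
        break;
      case Act::kNone:
        break;
    }
    data[i] = tmp;
  }
}
```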
@@ -2,4 +2,5 @@ if (NOT LITE_WITH_BM)
   return()
 endif()
-lite_cc_library(target_wrapper_bm SRCS target_wrapper.cc DEPS ${bm_runtime_libs})
+add_library(target_wrapper_bm STATIC target_wrapper.cc)
+target_link_libraries(target_wrapper_bm -Wl,-rpath,${BM_SDK_CPLIB_RPATH}:${BM_SDK_LIB_RPATH} -L${BM_SDK_CPLIB_RPATH} -L${BM_SDK_LIB_RPATH} -lbmcompiler -lbmcpu -lbmlib -lbmrt)
@@ -23,12 +23,16 @@ lite_cc_library(mir_passes
    fusion/quant_dequant_fuse_pass.cc
    fusion/sequence_pool_concat_fuse_pass.cc
    fusion/scale_activation_fuse_pass.cc
    fusion/reshape_fuse_pass.cc
    fusion/__xpu__resnet_fuse_pass.cc
    fusion/__xpu__resnet_cbam_fuse_pass.cc
    fusion/__xpu__multi_encoder_fuse_pass.cc
    fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
    fusion/__xpu__fc_fuse_pass.cc
    fusion/__xpu__mmdnn_fuse_pass.cc
    fusion/match_matrix_activation_fuse_pass.cc
    fusion/scales_fuse_pass.cc
    fusion/sequence_reverse_embedding_fuse_pass.cc
    elimination/identity_scale_eliminate_pass.cc
    elimination/identity_dropout_eliminate_pass.cc
    elimination/elementwise_mul_constant_eliminate_pass.cc
...
@@ -37,6 +37,18 @@ lite_cc_library(fuse_sequence_pool_concat
lite_cc_library(fuse_scale_activation
SRCS scale_activation_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_reshape
SRCS reshape_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_match_matrix_activation
SRCS match_matrix_activation_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_scales
SRCS scales_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_sequence_reverse_embedding
SRCS sequence_reverse_embedding_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers
fuse_fc
@@ -52,6 +64,10 @@ set(mir_fusers
fuse_interpolate
fuse_sequence_pool_concat
fuse_scale_activation
fuse_reshape
fuse_match_matrix_activation
fuse_scales
fuse_sequence_reverse_embedding
CACHE INTERNAL "fusers")
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
...
@@ -104,9 +104,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
   auto conv_weight_t =
       scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
   auto groups = conv_op_desc->GetAttr<int>("groups");
-  bool depthwise = false;
   if (conv_type_ == "conv2d_transpose") {
-    depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups);
     CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
              static_cast<size_t>(conv_weight_t->dims()[1] * groups))
         << "The BN bias's size should be equal to the size of the first "
@@ -120,7 +118,6 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
   size_t weight_num = conv_weight_t->data_size();
   bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
   bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits");
   // comupte BN alpha and beta
   Tensor alpha_tensor, beta_tensor;
   alpha_tensor.CopyDataFrom(*bn_bias_t);
@@ -162,12 +159,13 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
     auto conv_weight_d = conv_weight_t->mutable_data<int8_t>();
     // compute new conv_weight for int8
     auto weight_scale = conv_op_desc->GetInputScale(weight_name);
-    if (conv_type_ == "conv2d_transpose" && !depthwise) {
-      int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
-                   conv_weight_t->dims()[3];
+    if (conv_type_ == "conv2d_transpose") {
+      int cout = conv_weight_t->dims()[1] * groups;
+      int cin_group = conv_weight_t->dims()[0] / groups;
+      int c_size = cout * conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
       int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
-      for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
-        for (int i = 0; i < h; ++i) {
+      for (int k = 0; k < cin_group; ++k) {
+        for (int i = 0; i < cout; ++i) {
           weight_scale[i] *= fabsf(alpha_data[i]);
           if (alpha_data[i] < 0.f) {
             auto ptr_row = conv_weight_d + k * c_size + i * hw;
@@ -203,12 +201,13 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
   } else {
     // compute new conv_weight
     auto conv_weight_d = conv_weight_t->mutable_data<float>();
-    if (conv_type_ == "conv2d_transpose" && !depthwise) {
-      int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
-                   conv_weight_t->dims()[3];
+    if (conv_type_ == "conv2d_transpose") {
+      int cout = conv_weight_t->dims()[1] * groups;
+      int cin_group = conv_weight_t->dims()[0] / groups;
+      int c_size = cout * conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
       int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
-      for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
-        for (int i = 0; i < h; ++i) {
+      for (int k = 0; k < cin_group; ++k) {
+        for (int i = 0; i < cout; ++i) {
           auto ptr_row = conv_weight_d + k * c_size + i * hw;
           for (int j = 0; j < hw; ++j) {
             ptr_row[j] *= alpha_data[i];
...
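For reference, the conv+BN folding performed by this fuser can be written per output channel i in the standard way (independent of the variable names above):

```latex
\alpha_i = \frac{\gamma_i}{\sqrt{\sigma_i^2 + \varepsilon}}, \qquad
W_i' = \alpha_i \, W_i, \qquad
b_i' = \beta_i + \alpha_i \, (b_i - \mu_i)
```

Here gamma_i and beta_i are the BN scale and bias, and mu_i and sigma_i^2 are the running mean and variance. This is why the int8 branch above folds fabsf(alpha_i) into the per-channel weight scale and handles negative alpha_i in a separate branch over the corresponding weight rows.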
@@ -23,7 +23,7 @@ namespace lite {
 namespace mir {
 void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-#ifdef LITE_WITH_X86
+#if defined(LITE_WITH_X86) || defined(LITE_WITH_CUDA)
 #ifdef LITE_WITH_MLU
   fusion::FcFuser fuser(false);
   fuser(graph.get());
...
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/match_matrix_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/match_matrix_activation_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void MatchMatrixActFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::MatchMatrixActFuser fuser("relu");
fuser(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_match_matrix_activation_fuse_pass,
paddle::lite::mir::MatchMatrixActFusePass)
.BindTargets({TARGET(kCUDA)});
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class MatchMatrixActFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/match_matrix_activation_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void MatchMatrixActFuser::BuildPattern() {
// create nodes.
auto* x = VarNode("x")->assert_is_op_input("match_matrix_tensor", "X");
auto* W = VarNode("W")->assert_is_op_input("match_matrix_tensor", "W");
auto* y = VarNode("y")->assert_is_op_input("match_matrix_tensor", "Y");
auto* mm = OpNode("match_matrix_tensor", "match_matrix_tensor");
auto* mm_out =
VarNode("mm_out")->assert_is_op_output("match_matrix_tensor", "Out");
auto* mm_tmp =
VarNode("mm_tmp")->assert_is_op_output("match_matrix_tensor", "Tmp");
auto* act = OpNode("act", activation_);
auto* out = VarNode("Out")->assert_is_op_output(activation_, "Out");
// create topology.
std::vector<PMNode*> mm_inputs{x, W, y};
std::vector<PMNode*> mm_outputs{mm_out, mm_tmp};
mm_inputs >> *mm >> mm_outputs;
// Some op specialities.
mm_out->AsIntermediate();
mm->AsIntermediate();
act->AsIntermediate();
*mm_out >> *act >> *out;
}
void MatchMatrixActFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto mm_op = LiteOpRegistry::Global().Create("match_matrix_tensor");
auto mm = matched.at("match_matrix_tensor")->stmt()->op();
auto* scope = mm->scope();
auto& valid_places = mm->valid_places();
mm_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(mm_op, valid_places);
IR_NODE_LINK_TO(matched.at("x"), new_op_node);
IR_NODE_LINK_TO(matched.at("W"), new_op_node);
IR_NODE_LINK_TO(matched.at("y"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("Out"));
}
cpp::OpDesc MatchMatrixActFuser::GenOpDesc(const key2nodes_t& matched) {
auto op_desc = *matched.at("match_matrix_tensor")->stmt()->op_info();
int dim_t = matched.at("match_matrix_tensor")
->stmt()
->op_info()
->GetAttr<int>("dim_t");
op_desc.mutable_inputs()->clear();
op_desc.mutable_outputs()->clear();
op_desc.SetType("match_matrix_tensor");
op_desc.SetInput("X", {matched.at("x")->arg()->name});
op_desc.SetInput("W", {matched.at("W")->arg()->name});
op_desc.SetInput("Y", {matched.at("y")->arg()->name});
op_desc.SetOutput("Out", {matched.at("Out")->arg()->name});
op_desc.SetOutput("Tmp", {matched.at("mm_tmp")->arg()->name});
op_desc.SetAttr("dim_t", dim_t);
op_desc.SetAttr("fuse_relu", true);
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
(The diffs of several additional files are collapsed and not shown.)
@@ -38,6 +38,7 @@ add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.
add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm ${math_cuda})
add_kernel(sequence_reverse_compute_cuda CUDA extra SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_reverse_embedding_compute_cuda CUDA extra SRCS sequence_reverse_embedding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pad_compute_cuda CUDA extra SRCS sequence_pad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
add_kernel(sequence_unpad_compute_cuda CUDA extra SRCS sequence_unpad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
add_kernel(sequence_concat_compute_cuda CUDA extra SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps})
...
(The diffs of the remaining changed files are collapsed and not shown.)