Unverified commit 7cccc85c, authored by HappyAngel, committed by GitHub

Merge branch 'develop' into conv_dw_5x5

@@ -23,6 +23,9 @@ if(NOT DEFINED BM_SDK_ROOT)
   endif()
 endif()
 
+set(BM_SDK_CPLIB_RPATH ${BM_SDK_ROOT}/lib/bmcompiler)
+set(BM_SDK_LIB_RPATH ${BM_SDK_ROOT}/lib/bmnn/pcie)
+
 message(STATUS "BM_SDK_ROOT: ${BM_SDK_ROOT}")
 find_path(BM_SDK_INC NAMES bmruntime_interface.h
   PATHS ${BM_SDK_ROOT}/include/bmruntime NO_DEFAULT_PATH)
@@ -37,43 +40,35 @@ include_directories("${BM_SDK_ROOT}/include/bmcpu")
 include_directories("${BM_SDK_ROOT}/include/bmlog")
 
 find_library(BM_SDK_RT_LIB NAMES bmrt
-             PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie)
+             PATHS ${BM_SDK_LIB_RPATH})
 if(NOT BM_SDK_RT_LIB)
   message(FATAL_ERROR "Can not find bmrt Library in ${BM_SDK_ROOT}")
 else()
   message(STATUS "Found bmrt Library: ${BM_SDK_RT_LIB}")
-  add_library(bmrt SHARED IMPORTED GLOBAL)
-  set_property(TARGET bmrt PROPERTY IMPORTED_LOCATION ${BM_SDK_RT_LIB})
 endif()
 
 find_library(BM_SDK_BM_LIB NAMES bmlib
-             PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie)
+             PATHS ${BM_SDK_LIB_RPATH})
 if(NOT BM_SDK_BM_LIB)
   message(FATAL_ERROR "Can not find bmlib Library in ${BM_SDK_ROOT}")
 else()
   message(STATUS "Found bmlib Library: ${BM_SDK_BM_LIB}")
-  add_library(bmlib SHARED IMPORTED GLOBAL)
-  set_property(TARGET bmlib PROPERTY IMPORTED_LOCATION ${BM_SDK_BM_LIB})
 endif()
 
 find_library(BM_SDK_COMPILER_LIB NAMES bmcompiler
-             PATHS ${BM_SDK_ROOT}/lib/bmcompiler)
+             PATHS ${BM_SDK_CPLIB_RPATH})
 if(NOT BM_SDK_COMPILER_LIB)
   message(FATAL_ERROR "Can not find bmcompiler Library in ${BM_SDK_ROOT}")
 else()
   message(STATUS "Found bmcompiler Library: ${BM_SDK_COMPILER_LIB}")
-  add_library(bmcompiler SHARED IMPORTED GLOBAL)
-  set_property(TARGET bmcompiler PROPERTY IMPORTED_LOCATION ${BM_SDK_COMPILER_LIB})
 endif()
 
 find_library(BM_SDK_CPU_LIB NAMES bmcpu
-             PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie)
+             PATHS ${BM_SDK_LIB_RPATH})
 if(NOT BM_SDK_CPU_LIB)
   message(FATAL_ERROR "Can not find bmcpu Library in ${BM_SDK_ROOT}")
 else()
   message(STATUS "Found bmcpu Library: ${BM_SDK_CPU_LIB}")
-  add_library(bmcpu SHARED IMPORTED GLOBAL)
-  set_property(TARGET bmcpu PROPERTY IMPORTED_LOCATION ${BM_SDK_CPU_LIB})
 endif()
 
 set(bm_runtime_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm runtime libs")
...
@@ -44,6 +44,8 @@ sh run_benchmark.sh
3. It automatically runs another script, `benchmark.sh` (if several phones are connected over USB, append the test phone's `serial number` to the `adb` commands in `benchmark.sh`);
4. It downloads the benchmark results `result_armv7.txt` and `result_armv8.txt` from the phone to the current directory and displays the benchmark results.

> **Note:** If you hit an `Operation not permitted` error while running, grant permission with `sudo sh run_benchmark.sh`, toggle **USB debugging** and **File Transfer mode** off and on again on the phone, or reconnect the phone over USB and run the script again.

## 2. Step-by-step Benchmark

### 1. Build the benchmark executable
...
@@ -36,9 +36,11 @@
**Required environment**: Android Studio, an Android phone (with USB debugging enabled), and a local copy of the [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) project.

**Prerequisite**: If your Android Studio has not been configured with an NDK yet, follow [Install and configure the NDK and CMake](https://developer.android.com/studio/projects/install-ndk) in the Android Studio user guide to set one up first. You may choose the latest NDK version, or keep it consistent with the NDK version used in [Android build environment setup](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#android).

**Deployment steps**

1. The Android object detection demo is located at `Paddle-Lite-Demo\PaddleLite-android-demo\object_detection_demo`.

2. Open the object_detection_demo project in Android Studio (this step requires an Internet connection).

@@ -46,12 +48,17 @@
![Android_studio](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Android_studio.png)

**Note:** If you get an NDK configuration error while importing, building, or running the project, open File > Project Structure > SDK Location and set "Android NDK location" to the path of the NDK installed on your machine. If you downloaded the NDK through Android Studio's SDK Tools (see "Prerequisite" above), you can simply pick the default path from the drop-down. If the NDK configuration error still cannot be resolved, try updating the Android Gradle plugin as described in the [Update the Android Gradle plugin](https://developer.android.com/studio/releases/gradle-plugin?hl=zh-cn#updating-plugin) section of the Android Studio documentation.

<p align="center"><img width="600" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Andriod_Studio_NDK.png"/></p>

4. Press the Run button to automatically build the APP and install it on the phone. (This process automatically downloads the Paddle-Lite prediction library and the model, and requires an Internet connection.)

After a successful run it looks as follows. Figure 1: the APP installed on the phone. Figure 2: the opened APP, which automatically detects and labels the objects in the image.

<p align="center"><img width="300" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/AndroidApp0.png"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="300" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/AndroidApp1.jpg"/></p>

## Walkthrough of the Android demo structure

The code structure of the Android demo is shown in the figure below:
...
(One file's diff is collapsed and not shown.)
# Deploying PaddleLite Inference on the NPU (Huawei)

Paddle Lite is the first inference framework to support Huawei's in-house DaVinci-architecture NPU (the NPU in Kirin 810/990 SoCs).

It works by analyzing the Paddle model online, converting Paddle operators into HiAI IR, and then calling the HiAI IR/Builder/Runtime APIs to generate and execute a HiAI model.

## Supported devices

- Huawei nova5, nova5i pro, mate30, mate30 pro, mate30 5G, Honor v30, p40, and p40 pro, as well as the upcoming mate40. According to Huawei, most phones released from now on will ship with its in-house DaVinci-architecture NPU.

## Supported models
- MobileNetV1
- MobileNetV2
- ResNet-18/50
- ShuffleNetV2
- squeezenet
- mnasnet
- yolov3
- CycleGAN (currently requires a Huawei internal ROM)
- Baidu internal production models (details cannot be disclosed for confidentiality reasons)

*Mixed CPU/NPU scheduling can deliver better performance for some models.*

## Supported (or partially supported) Paddle operators
- sigmoid
- relu
- tanh
- relu_clipped
- leaky_relu
- softsign
- hard_sigmoid
- batch_norm
- concat
- conv2d
- depthwise_conv2d
- conv2d_transpose
- dropout
- elementwise_add
- elementwise_sub
- elementwise_mul
- elementwise_div
- fusion_elementwise_add_activation
- fusion_elementwise_sub_activation
- fusion_elementwise_mul_activation
- fusion_elementwise_div_activation
- fc
- bilinear_interp
- nearest_interp
- matmul
- mul
- pad2d
- pool2d
- reduce_mean
- reshape
- reshape2
- scale
- shuffle_channel
- softmax
- split
- sqrt
- square
- transpose
- transpose2
- unsqueeze
- unsqueeze2
- instance_norm (currently requires a Huawei internal ROM)
- layer_norm (currently requires a Huawei internal ROM)

## Building a Paddle Lite library with NPU support

- Download the Huawei HiAI DDK from the [Huawei HiAI platform](https://developer.huawei.com/consumer/cn/hiai) and extract it to any path (note: Huawei provides several DDK versions; download the HiAI Foundation development kit for the Kirin 810/990 chips, e.g. [DDK V310](https://obs.cn-north-2.myhwclouds.com/hms-ds-wf/sdk/hwhiai-ddk-100.310.011.010.zip)).
- Copy the ai_ddk_lib directory from the HiAI DDK into the root of the Paddle Lite source tree, then build with the [build script](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/tools/build_android.sh) (the NPU-related options must be specified).

Note: the directory layout of HiAI DDK V310 after extraction is shown below; it is the ai_ddk_lib directory that must be copied into the Paddle Lite source root.
```shell
- app_sample
- ddk
- ai_ddk_lib
- include
- lib # for armv7
- lib64 # for armv8
- document
- tools
```
- Recommended build command. Since the HiAI DDK .so libraries are all built against c++_shared, it is recommended to build Paddle Lite with c++_shared as well.
```shell
# huawei_kirin_npu_sdk_root must point to the path of ai_ddk_lib
$ ./lite/tools/build_android.sh --android_stl=c++_shared --with_huawei_kirin_npu=ON --huawei_kirin_npu_sdk_root=<path-to-ai_ddk_lib>
# Other options (e.g. the ARM architecture version) can be listed with "./lite/tools/build_android.sh help"
```
Note: to keep the build environment consistent, it is recommended to set up the Docker development environment described in [Compile from source](../user_guides/source_compile) first, and then run the command above.

## Generating an optimized NPU model

- The model_optimize_tool already supports generating NPU models: simply set valid_targets to npu,arm. See [model conversion](../user_guides/model_optimize_tool) for details.
```shell
./model_optimize_tool --model_dir=<model_param_dir> \
--model_file=<model_path> \
--param_file=<param_path> \
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=npu,arm \
    --record_tailoring_info=(true|false)
```
- The model produced by model_optimize_tool only marks the Paddle operators supported by the NPU; it does not actually generate an NPU HiAI model. Only at execution time are the marked Paddle operators converted into HiAI IR and the HiAI model generated and executed. See PR [2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576) for the implementation.
- Depending on the model and the phone model (ROM version), HiAI model generation may still fail at execution time, for example because some Paddle operators cannot be fully converted into HiAI IR or the target phone's HiAI version is too old. In that case, Paddle Lite falls back to the CPU operators to complete the whole inference task.

## Loading and running NPU models through the Java API

**Note: due to root-permission restrictions on Huawei phones, loading and running NPU models is currently supported only through the Java API.**

- Usage is the same as in the [Java demo](java_demo): no extra parameters need to be set; just swap in the NPU model. The Image Classification Demo for Android in [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) supports both CPU and NPU models.

Note: when copying libpaddle_lite_jni.so, since it depends on the HiAI DDK .so files and on libc++_shared.so, all .so files under ai_ddk_lib/lib or ai_ddk_lib/lib64 in the HiAI DDK, together with libc++_shared.so, must be copied into the same directory as libpaddle_lite_jni.so.

## Other notes

- Huawei's DaVinci-architecture NPU performs much of its internal computation in float16, so prediction results will deviate slightly; in most cases there is no significant accuracy loss. See the CPU vs. NPU predictions for the same image in the Image Classification Demo for Android in [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo).
- The in-house DaVinci-architecture NPU in the Kirin 810/990 SoCs is different from the Cambricon NPU in the Kirin 970/980 SoCs, and likewise different from the NNIE used by the Hi3559A and Hi3519A. Paddle Lite only supports Huawei's in-house DaVinci-architecture NPU.
- We keep adding Paddle operator bridges/converters for HiAI IR so that more Paddle models can be adapted, and Huawei engineers keep optimizing HiAI IR performance.

## Manually splitting subgraphs

### Background

- Paddle-Lite already supports a large number of Huawei NPU operators, but it still cannot cover every model. For a model with some unsupported operators, Paddle-Lite partitions the model into subgraphs that run on the NPU and subgraphs that run on the CPU and schedules them automatically, which usually gives good performance. In some special cases, however, the model is automatically split into a large number of subgraphs; the CPU/NPU switching overhead then becomes significant and overall performance degrades. A manual subgraph-splitting facility is therefore needed to pin selected operators to the CPU and avoid an excessive number of subgraphs.

### Feature

- A configuration file specifies the operators that are forced to run on the CPU.

### Usage

- 1. Open the Paddle model file with netron to inspect the model structure and obtain the operator types and their input and output names.
- Note: Paddle-Lite optimizes the model and operators may change in the process; use the operators of the optimized model as the reference. An example is given below.
- 2. Create a configuration file ```split_cfg.txt``` recording the operators that must run on the CPU.
- One OP record per line: the "op type", "op input names", and "op output names" are separated by colons ":", and the individual var names within "op input names" or "op output names" are separated by commas ",". (A minimal parsing sketch is given after this list.)
- Input or output names may be partially omitted. For example, ```op3:in3_var0``` selects operators of type "op3" whose input is "in3_var0", while ```op4``` selects all operators of type "op4".
- Example 1:
```
op0:in0_var0,in0_var1:out0_var0,out0_var1
op1:in1_var0,in1_var1:out1_var0
op2::out2_var0
op3:in3_var0
op4
```
- Example 2:
```
transpose:conv2d_22.tmp_1:transpose_0.tmp_0
```
![image](https://user-images.githubusercontent.com/50474132/80475316-4a5fda80-897b-11ea-910a-6aee13243387.png)
- 3. Use the environment variable ```SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE``` to specify the location of the configuration file.
- For example:
```
export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=/data/local/tmp/split_cfg.txt
```
- 4. After the steps above, operators in the running model that match the configuration will be forced to run on the CPU.
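To make the `op:inputs:outputs` convention above concrete, here is a minimal, hypothetical C++ sketch of parsing one line of `split_cfg.txt`. It is only an illustration of the format; it is not the parser Paddle-Lite actually uses, and the type and function names are made up.

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Illustrative only: splits one config line of the form
//   op_type[:input_var0,input_var1[:output_var0,output_var1]]
// into its three parts. Empty parts mean "match any".
struct OpFilter {
  std::string op_type;
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
};

static std::vector<std::string> SplitByComma(const std::string& s) {
  std::vector<std::string> out;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, ',')) {
    if (!item.empty()) out.push_back(item);
  }
  return out;
}

static OpFilter ParseLine(const std::string& line) {
  OpFilter f;
  std::stringstream ss(line);
  std::string part;
  int idx = 0;
  while (std::getline(ss, part, ':')) {
    if (idx == 0) f.op_type = part;          // e.g. "op3"
    else if (idx == 1) f.inputs = SplitByComma(part);   // e.g. {"in3_var0"}
    else if (idx == 2) f.outputs = SplitByComma(part);  // may be empty
    ++idx;
  }
  return f;
}

int main() {
  OpFilter f = ParseLine("op1:in1_var0,in1_var1:out1_var0");
  std::cout << f.op_type << " has " << f.inputs.size()
            << " input filter(s) and " << f.outputs.size()
            << " output filter(s)\n";  // prints: op1 has 2 input filter(s) and 1 output filter(s)
  return 0;
}
```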
### Example

- Take the model [ssd_mobilenet_v1](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz) as an example.
- 1. Inspect the model with netron.
- 2. Initial analysis.
- The figure below shows part of ssd_mobilenet_v1. The red part is not yet supported on the NPU, and the blue part may not perform well on the NPU. If the prediction library is simply left to schedule automatically, the model may be split into many subgraphs and overall performance will suffer. Therefore, the blue and green parts can be pinned to the CPU as a whole, letting the remaining parts run on the NPU automatically (the red part automatically runs on the CPU).
![](https://user-images.githubusercontent.com/50474132/80453173-525b5280-895a-11ea-847f-c7dd5b5799de.png)
- 3. Convert the model with opt.
- opt prints log messages during conversion. Search the log for ```digraph G``` and ```// end G``` to locate the optimized model graph.
![](https://user-images.githubusercontent.com/50474132/80454098-145f2e00-895c-11ea-9f16-dde1483a9beb.png)
![](https://user-images.githubusercontent.com/50474132/80454123-1de89600-895c-11ea-86b9-a62d78a6616d.png)
- Save the whole model-graph section, from ```digraph G``` to ```// end G```, into a ```.dot``` file. It can be viewed with ```graphviz``` or in the [online viewer](http://dreampuf.github.io/GraphvizOnline/).
![](https://user-images.githubusercontent.com/50474132/80454841-47ee8800-895d-11ea-9531-5689c5560fcb.png)
- At this point, check whether the operators you intend to pin have been transformed by the optimizer. (The expectation is that each pinned operator still exists on its own; if it has been fused into another operator, the fused operator must be specified instead.)
- 4. Write the configuration file.
- List in the configuration file the operators that the NPU could support but that should nevertheless run on the CPU.
```
reshape
transpose
concat
softmax
```
- Since all of these operators are pinned to the CPU, there is no need to specify their input or output names.
- 5. Specify the configuration file path.
- This is done with ```export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=your_split_config_file```.
- 6. Performance test
- Device: Huawei mate30 5G
- HiAI DDK version: 320
- Performance: about 71.8 ms on the CPU vs. about 16.6 ms on the NPU (roughly a 4.3x speedup).
@@ -55,7 +55,7 @@ Welcome to Paddle-Lite's documentation!
    demo_guides/cuda
    demo_guides/opencl
    demo_guides/fpga
-   demo_guides/npu
+   demo_guides/huawei_kirin_npu
    demo_guides/baidu_xpu
    demo_guides/rockchip_npu
    demo_guides/mediatek_apu
...
@@ -3,7 +3,7 @@
**Note: this build method only applies to release/v2.6.0 and later (v2.6.0 included).**

If you have not yet set up an Android cross-compilation environment, first follow [Environment preparation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#id2) to install the toolchain needed to build the Android prediction library for your development environment. Before running the build script, check that the environment variable `NDK_ROOT` points to the correct Android NDK installation path; after that you can download and build the Paddle-Lite source code.

```shell
# 1. Download the Paddle-Lite source code and switch to the release branch
@@ -14,6 +14,7 @@ cd Paddle-Lite && git checkout release/v2.3
./lite/tools/build_android.sh
```

> **Tip:** If the build spends a long time downloading third-party libraries, try deleting the `<lite-repo>/third-party` directory under Paddle-Lite and running the build script again; the script will then download a third-party source bundle hosted on Baidu Cloud, which saves the time of cloning each third-party git repo.

### Build results
...
@@ -3,10 +3,14 @@
opt is an x86 executable and must be run on a PC; Linux and Mac terminals are supported.

### Help information

Running opt without any options prints the help information and lists the currently supported options:

```bash
./opt
```

> **Note:** If you obtained the opt executable through the download links in "Method 2: download the opt executable" on the [Prepare opt](https://paddle-lite.readthedocs.io/zh/latest/user_guides/model_optimize_tool.html#id1) page, first run `chmod +x ./opt` to give the downloaded file execute permission.

![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png)

### Feature 1: convert a model into the Paddle-Lite format
...
@@ -38,7 +38,7 @@
### 2.3 Configure the calibration data generator

Post-training static quantization reads the calibration data internally through asynchronous data loading; you only need to configure a sample_generator that reads data matching the model's inputs. sample_generator is a Python generator that **must return a single sample each time** and is used as the data source of `DataLoader.set_sample_generator()`.

It is recommended to study the [asynchronous data loading documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/data_preparing/static_mode/use_py_reader.html) together with the example in this document to learn how to configure the calibration data generator.

### 2.4 Invoke post-training static quantization
...
# Pre-built library downloads

## Build variants
...
# Model conversion tool X2Paddle

X2Paddle converts Caffe, TensorFlow, and ONNX models into models supported by Paddle. Currently supported versions: Caffe 1.0; TensorFlow 1.x (1.4.0 recommended); ONNX 1.6.0 with OpSet versions 9, 10, and 11.

[X2Paddle](https://github.com/PaddlePaddle/X2Paddle) supports converting Caffe/TensorFlow models into PaddlePaddle models.

For supported models, see the **X2Paddle model test suite:**
...
@@ -39,12 +39,16 @@ USE_MIR_PASS(identity_dropout_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass);
USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
USE_MIR_PASS(lite_match_matrix_activation_fuse_pass);
USE_MIR_PASS(lite_scales_fuse_pass);
USE_MIR_PASS(lite_sequence_reverse_embedding_fuse_pass);
USE_MIR_PASS(lite_elementwise_activation_fuse_pass);
USE_MIR_PASS(lite_quant_dequant_fuse_pass);
USE_MIR_PASS(type_precision_cast_pass);
USE_MIR_PASS(type_layout_cast_pass);
USE_MIR_PASS(type_layout_cast_preprocess_pass);
USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(lite_reshape_fuse_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
...
@@ -9,6 +9,7 @@ if(WIN32)
  target_link_libraries(lite_pybind ${os_dependency_modules})
else()
  lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
  target_sources(lite_pybind PUBLIC ${__lite_cc_files})
endif(WIN32)
if (LITE_ON_TINY_PUBLISH)
...
@@ -106,6 +106,42 @@ void conv_depthwise_3x3s1_int8(Dtype* dout,
                               int padh,
                               ARMContext* ctx);
void conv_depthwise_3x3s1_int8_int8_impl(int8_t* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
int flag_act,
float* alpha,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
void conv_depthwise_3x3s1_int8_float_impl(float* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
int flag_act,
float* alpha,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
template <typename Dtype>
void conv_depthwise_3x3s2_int8(Dtype* dout,
                               const int8_t* din,
@@ -340,6 +376,118 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
                                        const int w_out,
                                        ARMContext* ctx);
void conv_depthwise_3x3s1p0_bias_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p0_bias_s_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p1_bias_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p1_bias_s_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p0_bias_s_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p1_bias_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p1_bias_s_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
} // namespace math
} // namespace arm
} // namespace lite
...
@@ -841,24 +841,52 @@ void conv_depthwise_3x3_int8_fp32(const void* din,
      alpha[3] = local_alpha;
    }
  }
  bool support_act_type = flag_act <= 1;
  bool support_pad_type =
      (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
      (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
  bool support_stride_type = (param.strides[0] == 1 && param.strides[1] == 1);
  bool support_width_type = w_in > 9 ? true : false;
  if (stride == 1) {
    if (!support_act_type || !support_pad_type || !support_stride_type ||
        !support_width_type) {
      conv_depthwise_3x3s1_int8(reinterpret_cast<float*>(dout),
                                reinterpret_cast<const int8_t*>(din),
                                reinterpret_cast<const int8_t*>(weights),
                                scale,
                                bias,
                                flag_bias,
                                flag_act,
                                alpha,
                                num,
                                ch_in,
                                h_in,
                                w_in,
                                h_out,
                                w_out,
                                pad_w,
                                pad_h,
                                ctx);
    } else {
      conv_depthwise_3x3s1_int8_float_impl(
          reinterpret_cast<float*>(dout),
          reinterpret_cast<const int8_t*>(din),
          reinterpret_cast<const int8_t*>(weights),
          scale,
          bias,
          flag_bias,
          flag_act,
          alpha,
          num,
          ch_in,
          h_in,
          w_in,
          h_out,
          w_out,
          pad_w,
          pad_h,
          ctx);
    }
  } else if (stride == 2) {
    conv_depthwise_3x3s2_int8(reinterpret_cast<float*>(dout),
                              reinterpret_cast<const int8_t*>(din),
@@ -924,24 +952,52 @@ void conv_depthwise_3x3_int8_int8(const void* din,
      alpha[3] = local_alpha;
    }
  }
  bool support_act_type = flag_act <= 1;
  bool support_pad_type =
      (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
      (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
  bool support_stride_type = (param.strides[0] == 1 && param.strides[1] == 1);
  bool support_width_type = w_in > 9 ? true : false;
  if (stride == 1) {
    if (!support_act_type || !support_pad_type || !support_stride_type ||
        !support_width_type) {
      conv_depthwise_3x3s1_int8(reinterpret_cast<int8_t*>(dout),
                                reinterpret_cast<const int8_t*>(din),
                                reinterpret_cast<const int8_t*>(weights),
                                scale,
                                bias,
                                flag_bias,
                                flag_act,
                                alpha,
                                num,
                                ch_in,
                                h_in,
                                w_in,
                                h_out,
                                w_out,
                                pad_w,
                                pad_h,
                                ctx);
    } else {
      conv_depthwise_3x3s1_int8_int8_impl(
          reinterpret_cast<int8_t*>(dout),
          reinterpret_cast<const int8_t*>(din),
          reinterpret_cast<const int8_t*>(weights),
          scale,
          bias,
          flag_bias,
          flag_act,
          alpha,
          num,
          ch_in,
          h_in,
          w_in,
          h_out,
          w_out,
          pad_w,
          pad_h,
          ctx);
    }
  } else if (stride == 2) {
    conv_depthwise_3x3s2_int8(reinterpret_cast<int8_t*>(dout),
                              reinterpret_cast<const int8_t*>(din),
...
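Reading the two dispatch blocks above together: the new specialized `conv_depthwise_3x3s1_int8_float_impl` / `conv_depthwise_3x3s1_int8_int8_impl` kernels are taken only for no-activation or relu, symmetric padding of 0 or 1, unit strides, and an input width greater than 9; everything else falls back to the generic `conv_depthwise_3x3s1_int8` path. A condensed sketch of that predicate follows; the parameter names are paraphrased from the diff and this is not an exported API.

```cpp
// Sketch of the dispatch condition used in conv_depthwise_3x3_int8_fp32/_int8
// above (paraphrased; the real code keeps four separate booleans).
static bool UseSpecializedS1Int8Kernel(int flag_act,
                                       const int* paddings,  // {top, bottom, left, right}
                                       const int* strides,   // {h, w}
                                       int w_in) {
  bool support_act = flag_act <= 1;  // no activation, or plain relu
  bool support_pad = paddings[0] == paddings[1] && paddings[2] == paddings[3] &&
                     paddings[0] == paddings[2] &&
                     (paddings[0] == 0 || paddings[0] == 1);
  bool support_stride = strides[0] == 1 && strides[1] == 1;
  bool support_width = w_in > 9;
  return support_act && support_pad && support_stride && support_width;
}
```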
@@ -300,13 +300,15 @@ void fill_bias_act<float>(float* tensor,
       switch (act_param->active_type) {
         case lite_api::ActivationType::kRelu:
           for (int i = 0; i < remain; i++) {
-            *dst = *src >= 0.f ? *src : 0.f;
+            float tmp = (*src + bias_data);
+            *dst = tmp >= 0.f ? tmp : 0.f;
             src++;
             dst++;
           }
         case lite_api::ActivationType::kRelu6:
           for (int i = 0; i < remain; i++) {
-            float tmp = *src >= 0.f ? *src : 0.f;
+            float tmp = (*src + bias_data);
+            tmp = tmp >= 0.f ? tmp : 0.f;
             *dst = tmp <= act_param->Relu_clipped_coef
                        ? tmp
                        : act_param->Relu_clipped_coef;
@@ -315,10 +317,11 @@ void fill_bias_act<float>(float* tensor,
           }
         case lite_api::ActivationType::kLeakyRelu:
           for (int i = 0; i < remain; i++) {
-            if (*src >= 0.f) {
-              *dst = *src;
+            float tmp = (*src + bias_data);
+            if (tmp >= 0.f) {
+              *dst = tmp;
             } else {
-              *dst = *src * act_param->Leaky_relu_alpha;
+              *dst = tmp * act_param->Leaky_relu_alpha;
             }
             src++;
             dst++;
@@ -336,17 +339,24 @@ void fill_bias_act<float>(float* tensor,
       float32x4_t vbias = vdupq_n_f32(bias_data);
       float* src = data + j * channel_size;
       float* dst = data + j * channel_size;
+      if (cnt > 0) {
 #ifdef __aarch64__
       asm volatile(FILL_BIAS FILL_STORE
-                   : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
-                   : [vbias] "w"(vbias)
-                   : "memory", "cc", "v0", "v1", "v2", "v3");
+                   :
+                   [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+                   : [vbias] "w"(vbias)
+                   : "memory", "cc", "v0", "v1", "v2", "v3");
 #else
       asm volatile(FILL_BIAS FILL_STORE
-                   : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
-                   : [vbias] "w"(vbias)
-                   : "memory", "cc", "q3", "q4", "q5", "q6");
+                   :
+                   [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+                   : [vbias] "w"(vbias)
+                   : "memory", "cc", "q3", "q4", "q5", "q6");
 #endif
+      }
+      for (int i = 0; i < remain; i++) {
+        *dst = *src + bias_data;
+      }
     }
   }
 }
...
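To make the intent of the remainder-path change explicit: the bias is now added first, and the activation is applied to the biased value (the old code activated the un-biased value). Below is a minimal scalar reference of that behavior, with paraphrased names; it is an illustration under those assumptions, not the production routine.

```cpp
#include <algorithm>

// Scalar reference for the tail elements of a fill-bias-then-activate pass:
// the bias is added before the activation is applied.
enum class Act { kNone, kRelu, kRelu6, kLeakyRelu };

inline void fill_bias_act_tail(float* data, int remain, float bias,
                               Act act, float relu6_coef, float leaky_alpha) {
  for (int i = 0; i < remain; ++i) {
    float tmp = data[i] + bias;  // bias first
    switch (act) {               // then the activation
      case Act::kRelu:
        tmp = std::max(tmp, 0.f);
        break;
      case Act::kRelu6:
        tmp = std::min(std::max(tmp, 0.f), relu6_coef);
        break;
      case Act::kLeakyRelu:
        tmp = tmp >= 0.f ? tmp : tmp * leaky_alpha;
        break;
      case Act::kNone:
        break;
    }
    data[i] = tmp;
  }
}
```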
@@ -2,4 +2,5 @@ if (NOT LITE_WITH_BM)
   return()
 endif()
-lite_cc_library(target_wrapper_bm SRCS target_wrapper.cc DEPS ${bm_runtime_libs})
+add_library(target_wrapper_bm STATIC target_wrapper.cc)
+target_link_libraries(target_wrapper_bm -Wl,-rpath,${BM_SDK_CPLIB_RPATH}:${BM_SDK_LIB_RPATH} -L${BM_SDK_CPLIB_RPATH} -L${BM_SDK_LIB_RPATH} -lbmcompiler -lbmcpu -lbmlib -lbmrt)
@@ -23,12 +23,16 @@ lite_cc_library(mir_passes
    fusion/quant_dequant_fuse_pass.cc
    fusion/sequence_pool_concat_fuse_pass.cc
    fusion/scale_activation_fuse_pass.cc
    fusion/reshape_fuse_pass.cc
    fusion/__xpu__resnet_fuse_pass.cc
    fusion/__xpu__resnet_cbam_fuse_pass.cc
    fusion/__xpu__multi_encoder_fuse_pass.cc
    fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
    fusion/__xpu__fc_fuse_pass.cc
    fusion/__xpu__mmdnn_fuse_pass.cc
    fusion/match_matrix_activation_fuse_pass.cc
    fusion/scales_fuse_pass.cc
    fusion/sequence_reverse_embedding_fuse_pass.cc
    elimination/identity_scale_eliminate_pass.cc
    elimination/identity_dropout_eliminate_pass.cc
    elimination/elementwise_mul_constant_eliminate_pass.cc
...
@@ -37,6 +37,18 @@ lite_cc_library(fuse_sequence_pool_concat
lite_cc_library(fuse_scale_activation
SRCS scale_activation_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_reshape
SRCS reshape_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_match_matrix_activation
SRCS match_matrix_activation_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_scales
SRCS scales_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_sequence_reverse_embedding
SRCS sequence_reverse_embedding_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers
fuse_fc
@@ -52,6 +64,10 @@ set(mir_fusers
fuse_interpolate
fuse_sequence_pool_concat
fuse_scale_activation
fuse_reshape
fuse_match_matrix_activation
fuse_scales
fuse_sequence_reverse_embedding
CACHE INTERNAL "fusers")
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
...
@@ -104,9 +104,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
   auto conv_weight_t =
       scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
   auto groups = conv_op_desc->GetAttr<int>("groups");
-  bool depthwise = false;
   if (conv_type_ == "conv2d_transpose") {
-    depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups);
     CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
              static_cast<size_t>(conv_weight_t->dims()[1] * groups))
         << "The BN bias's size should be equal to the size of the first "
@@ -120,7 +118,6 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
   size_t weight_num = conv_weight_t->data_size();
   bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
   bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits");
   // comupte BN alpha and beta
   Tensor alpha_tensor, beta_tensor;
   alpha_tensor.CopyDataFrom(*bn_bias_t);
@@ -162,12 +159,13 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
     auto conv_weight_d = conv_weight_t->mutable_data<int8_t>();
     // compute new conv_weight for int8
     auto weight_scale = conv_op_desc->GetInputScale(weight_name);
-    if (conv_type_ == "conv2d_transpose" && !depthwise) {
-      int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
-                   conv_weight_t->dims()[3];
+    if (conv_type_ == "conv2d_transpose") {
+      int cout = conv_weight_t->dims()[1] * groups;
+      int cin_group = conv_weight_t->dims()[0] / groups;
+      int c_size = cout * conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
       int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
-      for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
-        for (int i = 0; i < h; ++i) {
+      for (int k = 0; k < cin_group; ++k) {
+        for (int i = 0; i < cout; ++i) {
           weight_scale[i] *= fabsf(alpha_data[i]);
           if (alpha_data[i] < 0.f) {
             auto ptr_row = conv_weight_d + k * c_size + i * hw;
@@ -203,12 +201,13 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
   } else {
     // compute new conv_weight
     auto conv_weight_d = conv_weight_t->mutable_data<float>();
-    if (conv_type_ == "conv2d_transpose" && !depthwise) {
-      int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
-                   conv_weight_t->dims()[3];
+    if (conv_type_ == "conv2d_transpose") {
+      int cout = conv_weight_t->dims()[1] * groups;
+      int cin_group = conv_weight_t->dims()[0] / groups;
+      int c_size = cout * conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
       int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
-      for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
-        for (int i = 0; i < h; ++i) {
+      for (int k = 0; k < cin_group; ++k) {
+        for (int i = 0; i < cout; ++i) {
           auto ptr_row = conv_weight_d + k * c_size + i * hw;
           for (int j = 0; j < hw; ++j) {
             ptr_row[j] *= alpha_data[i];
...
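For reference, the conv+BN folding performed by this fuser can be written per output channel i in the standard way (independent of the variable names above):

```latex
\alpha_i = \frac{\gamma_i}{\sqrt{\sigma_i^2 + \varepsilon}}, \qquad
W_i' = \alpha_i \, W_i, \qquad
b_i' = \beta_i + \alpha_i \, (b_i - \mu_i)
```

Here gamma_i and beta_i are the BN scale and bias, and mu_i and sigma_i^2 are the running mean and variance. This is why the int8 branch above folds fabsf(alpha_i) into the per-channel weight scale and handles negative alpha_i in a separate branch over the corresponding weight rows.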
@@ -23,7 +23,7 @@ namespace lite {
 namespace mir {
 void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-#ifdef LITE_WITH_X86
+#if defined(LITE_WITH_X86) || defined(LITE_WITH_CUDA)
 #ifdef LITE_WITH_MLU
   fusion::FcFuser fuser(false);
   fuser(graph.get());
...
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/match_matrix_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/match_matrix_activation_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void MatchMatrixActFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::MatchMatrixActFuser fuser("relu");
fuser(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_match_matrix_activation_fuse_pass,
paddle::lite::mir::MatchMatrixActFusePass)
.BindTargets({TARGET(kCUDA)});
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class MatchMatrixActFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/match_matrix_activation_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void MatchMatrixActFuser::BuildPattern() {
// create nodes.
auto* x = VarNode("x")->assert_is_op_input("match_matrix_tensor", "X");
auto* W = VarNode("W")->assert_is_op_input("match_matrix_tensor", "W");
auto* y = VarNode("y")->assert_is_op_input("match_matrix_tensor", "Y");
auto* mm = OpNode("match_matrix_tensor", "match_matrix_tensor");
auto* mm_out =
VarNode("mm_out")->assert_is_op_output("match_matrix_tensor", "Out");
auto* mm_tmp =
VarNode("mm_tmp")->assert_is_op_output("match_matrix_tensor", "Tmp");
auto* act = OpNode("act", activation_);
auto* out = VarNode("Out")->assert_is_op_output(activation_, "Out");
// create topology.
std::vector<PMNode*> mm_inputs{x, W, y};
std::vector<PMNode*> mm_outputs{mm_out, mm_tmp};
mm_inputs >> *mm >> mm_outputs;
// Some op specialities.
mm_out->AsIntermediate();
mm->AsIntermediate();
act->AsIntermediate();
*mm_out >> *act >> *out;
}
void MatchMatrixActFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto mm_op = LiteOpRegistry::Global().Create("match_matrix_tensor");
auto mm = matched.at("match_matrix_tensor")->stmt()->op();
auto* scope = mm->scope();
auto& valid_places = mm->valid_places();
mm_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(mm_op, valid_places);
IR_NODE_LINK_TO(matched.at("x"), new_op_node);
IR_NODE_LINK_TO(matched.at("W"), new_op_node);
IR_NODE_LINK_TO(matched.at("y"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("Out"));
}
cpp::OpDesc MatchMatrixActFuser::GenOpDesc(const key2nodes_t& matched) {
auto op_desc = *matched.at("match_matrix_tensor")->stmt()->op_info();
int dim_t = matched.at("match_matrix_tensor")
->stmt()
->op_info()
->GetAttr<int>("dim_t");
op_desc.mutable_inputs()->clear();
op_desc.mutable_outputs()->clear();
op_desc.SetType("match_matrix_tensor");
op_desc.SetInput("X", {matched.at("x")->arg()->name});
op_desc.SetInput("W", {matched.at("W")->arg()->name});
op_desc.SetInput("Y", {matched.at("y")->arg()->name});
op_desc.SetOutput("Out", {matched.at("Out")->arg()->name});
op_desc.SetOutput("Tmp", {matched.at("mm_tmp")->arg()->name});
op_desc.SetAttr("dim_t", dim_t);
op_desc.SetAttr("fuse_relu", true);
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
(The diffs of several additional files are collapsed and not shown.)
@@ -38,6 +38,7 @@ add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.
add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm ${math_cuda})
add_kernel(sequence_reverse_compute_cuda CUDA extra SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_reverse_embedding_compute_cuda CUDA extra SRCS sequence_reverse_embedding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pad_compute_cuda CUDA extra SRCS sequence_pad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
add_kernel(sequence_unpad_compute_cuda CUDA extra SRCS sequence_unpad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
add_kernel(sequence_concat_compute_cuda CUDA extra SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps})
...
(The diffs of the remaining changed files are collapsed and not shown.)